# Python script to scrape Google News headlines.
import ssl
import re
from urllib.request import urlopen
##
# Google news headline scraper.
#
# Original: https://github.com/geekcomputers/Python/blob/master/Google_News.py
#
# Setup:
# Install dependencies using pip:
# $ pip3 install BeautifulSoup4
# $ pip3 install lxml
#
# Usage:
# Run the script:
# $ python google_news.py
##
from bs4 import BeautifulSoup as soup
# Non-greedy pattern for anything shaped like a tag: '<', the shortest run of
# characters (newlines excluded, since DOTALL is not set), then '>'.
RE_HTML_TAG = re.compile('<.*?>')
def normalize_spaces(text):
    """
    Replace non-breaking spaces with ordinary spaces and trim both ends.

    @param text: string to clean up
    @return: text with every U+00A0 replaced by ' ' and with leading and
             trailing whitespace removed
    """
    # The non-breaking space is spelled as an escape on purpose: a literal
    # U+00A0 in the source is indistinguishable from a plain space and is
    # easily turned into one by editors or copy/paste, which would make this
    # replace a no-op.
    return text.replace('\xa0', ' ').strip()
def strip_html_tags(text):
    """
    Return text with everything matched by RE_HTML_TAG removed.

    The input is first passed through normalize_spaces(); the tag matches
    are then deleted and the result is trimmed once more.

    @param text: string that may contain HTML markup
    @return: the cleaned string
    """
    cleaned = normalize_spaces(text)
    without_tags = RE_HTML_TAG.sub('', cleaned)
    return without_tags.strip()
def news(xml_news_url, counter):
    """
    Print the title, publish date and link of the first items of an RSS feed.

    @param xml_news_url: url of the RSS/XML feed to download and parse
    @param counter: maximum number of news items to print; zero or a
                    negative value prints nothing
    """
    # NOTE(review): certificate verification is disabled here (and via a
    # private ssl helper), so the feed could be tampered with in transit.
    # Kept as-is to avoid breaking setups without a usable CA bundle; prefer
    # ssl.create_default_context() where certificates are available.
    context = ssl._create_unverified_context()

    # The with-block closes the connection even if read() raises.
    with urlopen(xml_news_url, context=context) as response:
        xml_page = response.read()

    soup_page = soup(xml_page, "xml")
    news_list = soup_page.find_all("item")

    # Slice instead of counting by hand so that exactly `counter` items are
    # printed (the manual counter used to let one extra item through).
    # max() keeps a negative counter from slicing relative to the end.
    for item in news_list[:max(counter, 0)]:
        print("-" * 65)
        print("{0}\n".format(item.title.text))
        print("Publish Date...: {0}".format(item.pubDate.text))
        print("Link...........: {0}".format(item.link.text))
        # print("Description......: " + strip_html_tags(item.description.text))
        print("-" * 65, "\n")
# Any Google News RSS ('xml') URL can be used here, for any country/category.
news_url = "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"
# sports_url = "https://news.google.com/news/rss/headlines/section/topic/SPORTS.en_in/Sports?ned=in&hl=en-IN&gl=IN"
# Call news() with either of these URLs, or with both.
news(news_url, 10)
# news(sports_url, 5)