Skip to main content

Python script to scrape Google News headlines.

import ssl
import re
from urllib.request import urlopen

##
# Google news headline scraper.
#
# Original: https://github.com/geekcomputers/Python/blob/master/Google_News.py
#
# Setup:
#   Install dependencies using pip:
#     $ pip3 install BeautifulSoup4
#     $ pip3 install lxml
#
# Usage:
#   Run the script:
#     $ python google_news.py
##

from bs4 import BeautifulSoup as soup


# Non-greedy match of one HTML tag: a '<', the shortest run of characters
# (newlines excluded, since no DOTALL flag is set), then the closing '>'.
RE_HTML_TAG = re.compile(r"<.*?>")


def normalize_spaces(text):
    """
    Replace HTML non-breaking-space entities with plain spaces and trim.

    @param text: string that may contain literal '&nbsp;' entities
    @return: text with every '&nbsp;' replaced by a single space and
             leading/trailing whitespace removed
    """
    # Replace each entity on its own: matching only the doubled form
    # '&nbsp;&nbsp;' left single (or odd trailing) entities in the output.
    # Paired entities still become two spaces, exactly as before.
    return text.replace('&nbsp;', ' ').strip()


def strip_html_tags(text):
    """
    Return text with its HTML tags removed.

    The text is first passed through normalize_spaces, then every match of
    RE_HTML_TAG is deleted and the result is trimmed of surrounding
    whitespace.

    @param text: string that may contain HTML markup
    """
    cleaned = normalize_spaces(text)
    without_tags = RE_HTML_TAG.sub('', cleaned)
    return without_tags.strip()


def news(xml_news_url, counter):
    """
    Fetch an XML (RSS) news feed and print details of its first items.

    For each printed <item> element the title, publish date and link are
    written to stdout between dashed separator lines.

    @param xml_news_url: url of the feed to fetch and parse
    @param counter: maximum number of news items to print; zero or a
                    negative value prints nothing
    """

    # NOTE(review): this context disables TLS certificate verification (and
    # relies on a private ssl helper), so the feed's origin is not
    # authenticated. Kept to preserve existing behaviour; consider
    # ssl.create_default_context() instead.
    context = ssl._create_unverified_context()

    # The with-block closes the connection even if read() raises.
    with urlopen(xml_news_url, context=context) as response:
        xml_page = response.read()

    soup_page = soup(xml_page, "xml")
    news_list = soup_page.find_all("item")

    # Slice rather than count by hand: the previous loop compared its index
    # with `counter` only after printing, so it emitted counter + 1 items.
    # max() keeps a negative counter from slicing relative to the list end.
    for item in news_list[:max(counter, 0)]:
        print("-" * 65)
        print("{0}\n".format(item.title.text))
        print("Publish Date...: {0}".format(item.pubDate.text))
        print("Link...........: {0}".format(item.link.text))
        # print("Description......: " + strip_html_tags(item.description.text))
        print("-" * 65, "\n")


# Google News RSS ('xml') feed to read; the URL of any other
# country/category feed can be added here.
news_url = "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"
# sports_url = "https://news.google.com/news/rss/headlines/section/topic/SPORTS.en_in/Sports?ned=in&hl=en-IN&gl=IN"

# Call the news function with either of these URLs, or with both.
news(xml_news_url=news_url, counter=10)
# news(sports_url, 5)