Read RSS Feed with Python and Beautiful Soup

You can read RSS feeds with Python using requests and BeautifulSoup (bs4).

Getting Started

Install Required Libraries

In your Terminal, install the lxml, requests and beautifulsoup4 libraries.

$ pip install lxml requests beautifulsoup4

Import the Libraries

from bs4 import BeautifulSoup
import requests

Create the Class

The ReadRss class creates an object that will:

Join the Newsletter

    1. Fetch the given URL using requests.
    2. Parse the XML with BeautifulSoup.
    3. Create a list of dictionaries with article data.
    4. Create lists of links, titles, descriptions and publication dates from the list of dictionaries.
    class ReadRss:
        """Fetch an RSS feed over HTTP and expose its articles as plain lists.

        Attributes set by the constructor:
            url: The feed URL that was requested.
            headers: The HTTP headers sent with the request.
            r: The ``requests`` response, or ``None`` if the request failed.
            status_code: HTTP status code of the response, or ``None`` if the
                request failed.
            soup: The parsed document, or ``None`` if fetching or parsing failed.
            articles: The ``<item>`` elements found in the feed (empty on failure).
            articles_dicts: One dict per item with the keys ``title``, ``link``,
                ``description`` and ``pubdate``; a value is ``None`` when the
                item does not contain that element.
            urls, titles, descriptions, pub_dates: The corresponding values from
                ``articles_dicts``, in feed order.
        """

        def __init__(self, rss_url, headers):
            """Download ``rss_url`` with the given ``headers`` and parse its items.

            Fetch and parse errors are printed rather than raised; the object is
            then left with empty article lists.

            Args:
                rss_url: URL of the RSS feed to read.
                headers: Dict of HTTP headers (e.g. a User-Agent) for the request.
            """
            self.url = rss_url
            self.headers = headers

            # Defaults so every attribute exists even if fetching or parsing fails.
            self.r = None
            self.status_code = None
            self.soup = None
            self.articles = []

            try:
                self.r = requests.get(rss_url, headers=self.headers)
                self.status_code = self.r.status_code
            except requests.exceptions.RequestException as e:
                print('Error fetching the URL: ', rss_url)
                print(e)

            # Only attempt to parse when there is a response to parse.
            if self.r is not None:
                try:
                    self.soup = BeautifulSoup(self.r.text, 'lxml')
                except Exception as e:
                    # Broad on purpose: the parser backend's exception types
                    # are not imported in this file.
                    print('Could not parse the xml: ', self.url)
                    print(e)

            if self.soup is not None:
                self.articles = self.soup.find_all('item')

            self.articles_dicts = [
                {
                    'title': self._text_of(a, 'title'),
                    'link': self._link_of(a),
                    'description': self._text_of(a, 'description'),
                    'pubdate': self._text_of(a, 'pubdate'),
                }
                for a in self.articles
            ]
            self.urls = [d['link'] for d in self.articles_dicts]
            self.titles = [d['title'] for d in self.articles_dicts]
            self.descriptions = [d['description'] for d in self.articles_dicts]
            self.pub_dates = [d['pubdate'] for d in self.articles_dicts]

        @staticmethod
        def _text_of(item, tag_name):
            """Return the text of the first ``tag_name`` element in ``item``, or None."""
            node = item.find(tag_name)
            return node.text if node is not None else None

        @staticmethod
        def _link_of(item):
            """Return the item's link URL with newlines and tabs removed, or None.

            The URL is read from the node that follows the ``<link>`` tag rather
            than from inside it.
            NOTE(review): presumably the 'lxml' parser treats ``<link>`` as an
            empty element, pushing the URL into the next text node - confirm.
            """
            link_tag = item.find('link')
            if link_tag is None:
                return None
            sibling = link_tag.next_sibling
            # Only a text node can hold the URL; another tag or nothing means no link.
            if not isinstance(sibling, str):
                return None
            return sibling.replace('\n', '').replace('\t', '')
    
    

    Add Your User Agent

    Some servers block requests when they are not identified with a proper user agent.

    To find your user agent, you can search for it on Google.

    # HTTP headers handed to ReadRss; swap the placeholder for your own user agent.
    headers = {'User-Agent': 'your-user-agent-here'}
    

    Run the RSS Feed Reader

    Here, the script creates a ReadRss object for the given RSS URL and prints the list of article URLs found in the feed.

    if __name__ == '__main__':
        # Read the author's feed and print the article links it contains.
        feed = ReadRss(
            'https://www.jcchouinard.com/author/jean-christophe-chouinard/feed/',
            headers,
        )
        print(feed.urls)
    

    More Objects You Can Access

    # Article data as a list of dicts, one per feed item, with the keys
    # 'title', 'link', 'description' and 'pubdate'
    feed.articles_dicts
    
    # List of article URLs (the 'link' value of each item)
    feed.urls
    
    # List of article titles
    feed.titles
    
    # List of article descriptions
    feed.descriptions
    
    # List of publication dates, as the text found in the feed
    feed.pub_dates
    
    

    Full Code

    from bs4 import BeautifulSoup
    import requests
    
    # HTTP headers handed to ReadRss; swap the placeholder for your own user agent.
    headers = {'User-Agent': 'your-user-agent-here'}
    
    class ReadRss:
        """Fetch an RSS feed over HTTP and expose its articles as plain lists.

        Attributes set by the constructor:
            url: The feed URL that was requested.
            headers: The HTTP headers sent with the request.
            r: The ``requests`` response, or ``None`` if the request failed.
            status_code: HTTP status code of the response, or ``None`` if the
                request failed.
            soup: The parsed document, or ``None`` if fetching or parsing failed.
            articles: The ``<item>`` elements found in the feed (empty on failure).
            articles_dicts: One dict per item with the keys ``title``, ``link``,
                ``description`` and ``pubdate``; a value is ``None`` when the
                item does not contain that element.
            urls, titles, descriptions, pub_dates: The corresponding values from
                ``articles_dicts``, in feed order.
        """

        def __init__(self, rss_url, headers):
            """Download ``rss_url`` with the given ``headers`` and parse its items.

            Fetch and parse errors are printed rather than raised; the object is
            then left with empty article lists.

            Args:
                rss_url: URL of the RSS feed to read.
                headers: Dict of HTTP headers (e.g. a User-Agent) for the request.
            """
            self.url = rss_url
            self.headers = headers

            # Defaults so every attribute exists even if fetching or parsing fails.
            self.r = None
            self.status_code = None
            self.soup = None
            self.articles = []

            try:
                self.r = requests.get(rss_url, headers=self.headers)
                self.status_code = self.r.status_code
            except requests.exceptions.RequestException as e:
                print('Error fetching the URL: ', rss_url)
                print(e)

            # Only attempt to parse when there is a response to parse; otherwise
            # a failed fetch would be misreported as a parse error.
            if self.r is not None:
                try:
                    self.soup = BeautifulSoup(self.r.text, 'lxml')
                except Exception as e:
                    # Broad on purpose: the parser backend's exception types
                    # are not imported in this file.
                    print('Could not parse the xml: ', self.url)
                    print(e)

            if self.soup is not None:
                self.articles = self.soup.find_all('item')

            self.articles_dicts = [
                {
                    'title': self._text_of(a, 'title'),
                    'link': self._link_of(a),
                    'description': self._text_of(a, 'description'),
                    'pubdate': self._text_of(a, 'pubdate'),
                }
                for a in self.articles
            ]
            self.urls = [d['link'] for d in self.articles_dicts]
            self.titles = [d['title'] for d in self.articles_dicts]
            self.descriptions = [d['description'] for d in self.articles_dicts]
            self.pub_dates = [d['pubdate'] for d in self.articles_dicts]

        @staticmethod
        def _text_of(item, tag_name):
            """Return the text of the first ``tag_name`` element in ``item``, or None."""
            node = item.find(tag_name)
            return node.text if node is not None else None

        @staticmethod
        def _link_of(item):
            """Return the item's link URL with newlines and tabs removed, or None.

            The URL is read from the node that follows the ``<link>`` tag rather
            than from inside it.
            NOTE(review): presumably the 'lxml' parser treats ``<link>`` as an
            empty element, pushing the URL into the next text node - confirm.
            """
            link_tag = item.find('link')
            if link_tag is None:
                return None
            sibling = link_tag.next_sibling
            # Only a text node can hold the URL; another tag or nothing means no link.
            if not isinstance(sibling, str):
                return None
            return sibling.replace('\n', '').replace('\t', '')
    
    if __name__ == '__main__':
        # Read the author's feed and print the article links it contains.
        feed = ReadRss(
            'https://www.jcchouinard.com/author/jean-christophe-chouinard/feed/',
            headers,
        )
        print(feed.urls)
    

    That's it for this tutorial on creating an RSS feed reader with Python, Requests and BeautifulSoup.

    4.7/5 - (3 votes)