Read RSS Feed with Python and Beautiful Soup

You can read RSS feeds with Python using requests and BeautifulSoup (bs4).

Getting Started

Install Required Libraries

In your Terminal, install the lxml, requests and beautifulsoup4 libraries.

$ pip install lxml requests beautifulsoup4

Import the Libraries

from bs4 import BeautifulSoup
import requests

Create the Class

The ReadRss class creates an object that will:

Fetch the given URL using requests;
Parse the XML with BeautifulSoup;
Create a list of dictionaries with the article data;
Create lists of links, titles, descriptions and publication dates from the list of dictionaries.

class ReadRss:
    """Fetch an RSS feed and expose its items as plain Python lists.

    Attributes:
        url: The feed URL passed to the constructor.
        headers: The HTTP headers sent with the request.
        r: The ``requests`` response, or ``None`` if the request failed.
        status_code: HTTP status code of the response, or ``None``.
        soup: The parsed document, or ``None`` if fetching/parsing failed.
        articles: The ``<item>`` tags found in the feed (empty on failure).
        articles_dicts: One dict per item with the keys ``'title'``,
            ``'link'``, ``'description'`` and ``'pubdate'``.
        urls: The ``'link'`` value of every item, in feed order.
        titles: The ``'title'`` value of every item, in feed order.
        descriptions: The ``'description'`` value of every item.
        pub_dates: The ``'pubdate'`` value of every item.
    """

    def __init__(self, rss_url, headers, timeout=30):
        """Download and parse the feed at *rss_url*.

        Args:
            rss_url: URL of the RSS feed.
            headers: Dict of HTTP headers (e.g. a ``User-Agent``) sent with
                the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled server cannot hang the program indefinitely.

        Fetch and parse errors are printed rather than raised; in that case
        the article lists are simply empty.
        """
        self.url = rss_url
        self.headers = headers

        # Give every attribute a value up front so a failed fetch or parse
        # leaves a usable (empty) object instead of raising AttributeError.
        self.r = None
        self.status_code = None
        self.soup = None
        self.articles = []

        try:
            self.r = requests.get(rss_url, headers=self.headers, timeout=timeout)
            self.status_code = self.r.status_code
        except requests.RequestException as e:
            print('Error fetching the URL: ', rss_url)
            print(e)

        if self.r is not None:
            try:
                self.soup = BeautifulSoup(self.r.text, 'lxml')
            except Exception as e:
                # Broad on purpose: parser availability/markup errors are
                # reported and the object is left empty.
                print('Could not parse the xml: ', self.url)
                print(e)

        if self.soup is not None:
            self.articles = self.soup.find_all('item')

        self.articles_dicts = [
            {
                'title': self._field_text(item, 'title'),
                'link': self._link_url(item),
                'description': self._field_text(item, 'description'),
                # Lower-case name kept from the original lookup
                # (presumably because the 'lxml' HTML parser lower-cases
                # <pubDate>).
                'pubdate': self._field_text(item, 'pubdate'),
            }
            for item in self.articles
        ]
        self.urls = [d['link'] for d in self.articles_dicts]
        self.titles = [d['title'] for d in self.articles_dicts]
        self.descriptions = [d['description'] for d in self.articles_dicts]
        self.pub_dates = [d['pubdate'] for d in self.articles_dicts]

    @staticmethod
    def _field_text(item, tag_name):
        """Return the text of the first *tag_name* child, or '' if absent."""
        tag = item.find(tag_name)
        return tag.text if tag is not None else ''

    @staticmethod
    def _link_url(item):
        """Return the item's link URL with newlines and tabs removed."""
        link_tag = item.find('link')
        if link_tag is None:
            return ''
        # The original code reads the URL from the node *after* <link>
        # (with the 'lxml' HTML parser the tag itself comes out empty).
        # Fall back to the tag's own text when no text node follows.
        url = link_tag.next_sibling
        if not isinstance(url, str):
            url = link_tag.text
        return url.replace('\n', '').replace('\t', '')

Add Your User Agent

Some servers block requests when they are not identified with a proper user agent.

To find your user agent, search for "what is my user agent" on Google, then paste the result into the headers dictionary below.

# HTTP headers sent with the feed request; replace the placeholder value
# with your own user agent string.
headers = {'User-Agent': 'your-user-agent-here'}

Run the RSS Feed Reader

Here, the script creates a ReadRss object for the given RSS URL and prints the list of article URLs found in the feed.

if __name__ == '__main__':
    # Read the author feed and print the link of every article it lists.
    feed = ReadRss(
        rss_url='https://www.jcchouinard.com/author/jean-christophe-chouinard/feed/',
        headers=headers,
    )
    print(feed.urls)

More Objects You Can Access

# All article data: a list of dicts, one per feed item, with the keys
# 'title', 'link', 'description' and 'pubdate'
feed.articles_dicts

# The 'link' value (article URL) of every item in the feed
feed.urls

# The 'title' value of every item in the feed
feed.titles

# The 'description' value of every item in the feed
feed.descriptions

# The 'pubdate' value (publication date) of every item in the feed
feed.pub_dates

Full Code

from bs4 import BeautifulSoup
import requests

# Headers passed to requests.get(); swap in a real user agent string
# before running the script.
headers = {
    'User-Agent': 'your-user-agent-here',
}

class ReadRss:
    """Fetch an RSS feed and expose its items as plain Python lists.

    Attributes:
        url: The feed URL passed to the constructor.
        headers: The HTTP headers sent with the request.
        r: The ``requests`` response, or ``None`` if the request failed.
        status_code: HTTP status code of the response, or ``None``.
        soup: The parsed document, or ``None`` if fetching/parsing failed.
        articles: The ``<item>`` tags found in the feed (empty on failure).
        articles_dicts: One dict per item with the keys ``'title'``,
            ``'link'``, ``'description'`` and ``'pubdate'``.
        urls: The ``'link'`` value of every item, in feed order.
        titles: The ``'title'`` value of every item, in feed order.
        descriptions: The ``'description'`` value of every item.
        pub_dates: The ``'pubdate'`` value of every item.
    """

    def __init__(self, rss_url, headers, timeout=30):
        """Download and parse the feed at *rss_url*.

        Args:
            rss_url: URL of the RSS feed.
            headers: Dict of HTTP headers (e.g. a ``User-Agent``) sent with
                the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled server cannot hang the program indefinitely.

        Fetch and parse errors are printed rather than raised; in that case
        the article lists are simply empty.
        """
        self.url = rss_url
        self.headers = headers

        # Give every attribute a value up front so a failed fetch or parse
        # leaves a usable (empty) object instead of raising AttributeError.
        self.r = None
        self.status_code = None
        self.soup = None
        self.articles = []

        try:
            self.r = requests.get(rss_url, headers=self.headers, timeout=timeout)
            self.status_code = self.r.status_code
        except requests.RequestException as e:
            print('Error fetching the URL: ', rss_url)
            print(e)

        if self.r is not None:
            try:
                self.soup = BeautifulSoup(self.r.text, 'lxml')
            except Exception as e:
                # Broad on purpose: parser availability/markup errors are
                # reported and the object is left empty.
                print('Could not parse the xml: ', self.url)
                print(e)

        if self.soup is not None:
            self.articles = self.soup.find_all('item')

        self.articles_dicts = [
            {
                'title': self._field_text(item, 'title'),
                'link': self._link_url(item),
                'description': self._field_text(item, 'description'),
                # Lower-case name kept from the original lookup
                # (presumably because the 'lxml' HTML parser lower-cases
                # <pubDate>).
                'pubdate': self._field_text(item, 'pubdate'),
            }
            for item in self.articles
        ]
        self.urls = [d['link'] for d in self.articles_dicts]
        self.titles = [d['title'] for d in self.articles_dicts]
        self.descriptions = [d['description'] for d in self.articles_dicts]
        self.pub_dates = [d['pubdate'] for d in self.articles_dicts]

    @staticmethod
    def _field_text(item, tag_name):
        """Return the text of the first *tag_name* child, or '' if absent."""
        tag = item.find(tag_name)
        return tag.text if tag is not None else ''

    @staticmethod
    def _link_url(item):
        """Return the item's link URL with newlines and tabs removed."""
        link_tag = item.find('link')
        if link_tag is None:
            return ''
        # The original code reads the URL from the node *after* <link>
        # (with the 'lxml' HTML parser the tag itself comes out empty).
        # Fall back to the tag's own text when no text node follows.
        url = link_tag.next_sibling
        if not isinstance(url, str):
            url = link_tag.text
        return url.replace('\n', '').replace('\t', '')

if __name__ == '__main__':
    # Fetch the example feed with the headers defined above, then list
    # the article URLs it contains.
    feed = ReadRss(
        'https://www.jcchouinard.com/author/jean-christophe-chouinard/feed/',
        headers,
    )
    print(feed.urls)

That's it for this tutorial on creating an RSS feed reader with Python, Requests and BeautifulSoup.

4.7/5 - (3 votes)

Jean-Christophe Chouinard

SEO Strategist at Tripadvisor, ex- Seek (Melbourne, Australia). Specialized in technical SEO. Writer in Python, Information Retrieval, SEO and machine learning. Guest author at SearchEngineJournal, SearchEngineLand and OnCrawl.