In this post, I will show you how to parse a Robots.txt file and save it to Pandas Dataframe using Python.
The full code is available at the end of this blog post and on Github.
We will parse the robots.txt file using the os and pandas packages and the urlparse function from urllib. os and urllib are part of the Python standard library; make sure that you install pandas using either pip or conda.
How to Save Robots.txt to a Pandas Dataframe
- Read Robots.txt File
- Create an empty dictionary
- Parse the Robots.txt into the dictionary
- Create a dataframe with the dictionary
- Execute the Script
First, let’s import the libraries.
import pandas as pd
import os
from urllib.parse import urlparse
1. Read Robots.txt File
These two functions will be used to get the location of the robots.txt file, then open and read it, using a URL from the domain that we want to fetch it from.
def get_robots_url(url):
    """Return the robots.txt URL for the domain that *url* belongs to."""
    uri = urlparse(url)
    return f'{uri.scheme}://{uri.netloc}/robots.txt'
def read_robots_txt(url):
    """Download and return the robots.txt content for *url*'s domain as text.

    The original implementation shelled out with os.popen(f'curl {url}'),
    which is a shell-injection risk, requires the external curl binary, and
    silently returns '' on failure. urllib.request is portable stdlib and
    raises an explicit error when the fetch fails.
    """
    import urllib.request  # stdlib; local import so the snippet is self-contained

    robot_url = get_robots_url(url)
    with urllib.request.urlopen(robot_url) as response:
        # robots.txt is plain text; replace undecodable bytes rather than crash
        return response.read().decode('utf-8', errors='replace')
These functions can be called using read_robots_txt('https://www.example.com/'), and they will return the content of the robots.txt file as text to parse.
2. Create an empty dictionary
Initialize an empty dictionary in which we are going to add the parsed line later.
def initialize_dict(url):
    """Read the robots.txt for *url* and build the skeleton result dict.

    Returns a tuple (result_data_set, keys, robot_file):
      - result_data_set: {'User-agent': {agent_name: {}}} for each agent found
      - keys: the list of agent names, in file order
      - robot_file: the raw robots.txt text, for later parsing
    """
    robot_file = read_robots_txt(url)
    # The key must be 'User-agent' (lowercase 'a'): parse_robot() indexes the
    # dict with that exact spelling, and robots.txt files conventionally write
    # 'User-agent' — the original 'User-Agent' never matched a real file and
    # then caused a KeyError in parse_robot().
    result_data_set = {'User-agent': {}}
    for line in robot_file.split("\n"):
        if line.startswith('User-agent'):
            # maxsplit=1 keeps agent values containing ':' intact
            agent = line.split(':', 1)[1].strip()
            result_data_set['User-agent'][agent] = {}
    keys = list(result_data_set['User-agent'])
    return result_data_set, keys, robot_file
The initialize_dict(url)
function will be called later from the next function: parse_robot(url)
.
3. Parse the Robots.txt into the dictionary
Parse each line of the Robots.txt file and append it to the dictionary.
def parse_robot(url):
    """Parse the robots.txt of *url*'s domain into a nested dict.

    Returns {'User-agent': {agent: {'Disallow': [...], 'Allow': [...]}}}.
    Each agent's rules are collected from the lines between that agent's
    'User-agent' line and the next agent's line.
    """
    result_data_set, keys, robot_file = initialize_dict(url)
    lines = robot_file.split("\n")
    print_flag = False
    for i, agent in enumerate(keys):
        # This agent's block ends where the next agent's block begins;
        # the sentinel never appears, so the last block runs to EOF.
        end_str = keys[i + 1] if i <= len(keys) - 2 else 'We are done'
        result_data_set['User-agent'][agent]['Disallow'] = []
        result_data_set['User-agent'][agent]['Allow'] = []
        for line in lines:
            if end_str in line:
                print_flag = False
            elif agent in line:
                print_flag = True
            elif print_flag and line.startswith(('Disallow', 'Allow')):
                # partition splits on the FIRST colon only, so rule values
                # that contain ':' (e.g. full URLs) are kept intact — the
                # original split(':')[1] truncated them and raised
                # IndexError on a line without a colon.
                status, _, val = line.partition(':')
                result_data_set['User-agent'][agent][status.strip()].append(val.strip())
    return result_data_set
The parse_robot(url) function will be called to create the dataset that is converted to a Dataframe in the next function: robots_to_df(url).
4. Create a Dataframe with the Dictionary
Create a Dataframe from the dictionary that you added the parsed lines of the robots.txt file.
def robots_to_df(url):
    """Fetch and parse robots.txt for *url*, returning a tidy DataFrame.

    One row per rule, with columns: User-agent, Status (Allow/Disallow)
    and Pattern (the URL pattern of the rule).
    """
    result_data_set = parse_robot(url)
    # Distinct names per nesting level — the original rebound `v` and
    # `value` inside their own loops, which only worked by accident.
    rows = {'User-agent': [], 'Status': [], 'Pattern': []}
    for top_key in result_data_set:
        for agent, statuses in result_data_set[top_key].items():
            for status, patterns in statuses.items():
                for pattern in patterns:
                    rows['User-agent'].append(agent)
                    rows['Status'].append(status)
                    rows['Pattern'].append(pattern)
    return pd.DataFrame.from_dict(rows)
5. Execute the Main Function
Now that all the functions are defined. Execute the function using the robots_to_df(url)
.
# Domain whose robots.txt will be fetched and parsed.
url = 'https://www.example.com/'
# Runs the whole pipeline: fetch -> parse -> DataFrame.
robots_to_df(url)
That’s it! If you have come this far, please share!
Full Code
import pandas as pd
import os
from urllib.parse import urlparse
ua = 'User-agent'  # shared dict key / line prefix for robots.txt user-agent entries
def get_robots_url(url):
    """Derive the robots.txt location from any URL on the same domain."""
    parsed = urlparse(url)
    pieces = [parsed.scheme, '://', parsed.netloc, '/robots.txt']
    return ''.join(pieces)
def read_robots_txt(url):
    """Download and return the robots.txt content for *url*'s domain as text.

    Replaces os.popen(f'curl {url}'): that form is a shell-injection risk,
    needs the external curl binary, and silently yields '' on failure.
    """
    import urllib.request  # stdlib; local import so the snippet is self-contained

    robot_url = get_robots_url(url)
    with urllib.request.urlopen(robot_url) as response:
        # robots.txt is plain text; replace undecodable bytes rather than crash
        return response.read().decode('utf-8', errors='replace')
def initialize_dict(url):
    """Read the robots.txt for *url* and build the skeleton result dict.

    Returns a tuple (result_data_set, keys, robot_file):
      - result_data_set: {ua: {agent_name: {}}} for each agent found
      - keys: the list of agent names, in file order
      - robot_file: the raw robots.txt text, for later parsing
    """
    robot_file = read_robots_txt(url)
    result_data_set = {ua: {}}
    for line in robot_file.split("\n"):
        if line.startswith(ua):
            # maxsplit=1 keeps agent values containing ':' intact
            result_data_set[ua][line.split(':', 1)[1].strip()] = {}
    # list() over a dict yields its keys — replaces the manual append loop
    keys = list(result_data_set[ua])
    return result_data_set, keys, robot_file
def parse_robot(url):
    """Parse the robots.txt of *url*'s domain into a nested dict.

    Returns {ua: {agent: {'Disallow': [...], 'Allow': [...]}}}.
    Each agent's rules are collected from the lines between that agent's
    'User-agent' line and the next agent's line.
    """
    result_data_set, keys, robot_file = initialize_dict(url)
    lines = robot_file.split("\n")
    print_flag = False
    for i, agent in enumerate(keys):
        # This agent's block ends where the next agent's block begins;
        # the sentinel never appears, so the last block runs to EOF.
        end_str = keys[i + 1] if i <= len(keys) - 2 else 'We are done'
        result_data_set[ua][agent]['Disallow'] = []
        result_data_set[ua][agent]['Allow'] = []
        for line in lines:
            if end_str in line:
                print_flag = False
            elif agent in line:
                print_flag = True
            elif print_flag and line.startswith(('Disallow', 'Allow')):
                # partition splits on the FIRST colon only, so rule values
                # that contain ':' (e.g. full URLs) are kept intact — the
                # original split(':')[1] truncated them and raised
                # IndexError on a line without a colon.
                status, _, val = line.partition(':')
                result_data_set[ua][agent][status.strip()].append(val.strip())
    return result_data_set
def robots_to_df(url):
    """Fetch and parse robots.txt for *url*, returning a tidy DataFrame.

    One row per rule, with columns: User-agent, Status (Allow/Disallow)
    and Pattern (the URL pattern of the rule).
    """
    result_data_set = parse_robot(url)
    # Distinct names per nesting level — the original rebound `v` and
    # `value` inside their own loops, which only worked by accident.
    rows = {ua: [], 'Status': [], 'Pattern': []}
    for top_key in result_data_set:
        for agent, statuses in result_data_set[top_key].items():
            for status, patterns in statuses.items():
                for pattern in patterns:
                    rows[ua].append(agent)
                    rows['Status'].append(status)
                    rows['Pattern'].append(pattern)
    return pd.DataFrame.from_dict(rows)
# The "Full Code" listing never defined `url`, so the call below raised
# NameError when run as-is; set the target domain first.
url = 'https://www.example.com/'
robots_to_df(url)
This is the end of the tutorial on how to parse a robots.txt file to a Dataframe using Python.
SEO Strategist at Tripadvisor, ex- Seek (Melbourne, Australia). Specialized in technical SEO. Writer in Python, Information Retrieval, SEO and machine learning. Guest author at SearchEngineJournal, SearchEngineLand and OnCrawl.