Here is a collection of useful Python functions that SEOs can use for everyday tasks such as making HTTP requests, checking robots.txt, parsing logs, and manipulating data with pandas.
Generate Fake User-Agent
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
ua = UserAgent()
# ua.chrome returns a Chrome user-agent; ua.random returns a random one
header = {'User-Agent': str(ua.random)}
url = 'https://www.example.com'
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
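As a quick check, you can print the page title from the parsed soup (assuming the request succeeded and the page has a title tag):
print(soup.title.string)  # e.g. 'Example Domain' for https://www.example.com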
Check Robots.txt and Fetch With BeautifulSoup
# Import Libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from reppy.robots import Robots
# Set URLs
good_url = 'https://ca.indeed.com/Python-jobs'
bad_url = 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws'
# Convert URL to extract robots.txt location
def get_bot_loc(url):
    domain_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    bot_loc = domain_url + '/robots.txt'
    return bot_loc
# Parse the Robots.txt to see if URL is allowed
def robot_parser(url):
    bot_loc = get_bot_loc(url)
    parser = Robots.fetch(bot_loc)
    validation = parser.allowed(url, '*')
    return validation
# Fetch URL and return HTML if URL is allowed in Robots.txt
def fetch(url):
    validation = robot_parser(url)
    if validation:
        try:
            response = requests.get(url)
        except requests.exceptions.ConnectionError:
            print('Error: "%s" is not available!' % url)
            return None
        content = BeautifulSoup(response.text, 'lxml')
    else:
        content = '%s is blocked by robots.txt' % url
    return content
fetch(bad_url)
# 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws is blocked by robots.txt'
fetch(good_url)
# HTML returned
Reverse DNS Lookup for Googlebot
Based on Koray Tuğberk GÜBÜR's post, here is a quick way to run a reverse DNS lookup on a list of IPs to check which ones really belong to Googlebot.
import socket
filename = 'ip_list.txt'
with open(filename, 'r') as f:
    content = f.read()
ls = content.split('\n')
nongooglebot = []
googlebot = []
for row in ls:
    try:
        # Reverse DNS lookup: IP -> hostname
        reversed_dns = socket.gethostbyaddr(row)
        if 'googlebot.com' in reversed_dns[0] or 'google.com' in reversed_dns[0]:
            # Forward-confirm: the returned hostname must resolve back to the same IP
            forward_ip = socket.gethostbyname(reversed_dns[0])
            if forward_ip == row:
                googlebot.append((row, reversed_dns[0]))
            else:
                nongooglebot.append((row, reversed_dns[0]))
        else:
            nongooglebot.append((row, reversed_dns[0]))
    except socket.error:
        # Skip rows that cannot be resolved (e.g. empty lines)
        pass
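Once the loop has run, a quick summary of the two lists:
print(f'{len(googlebot)} verified Googlebot IPs')
print(f'{len(nongooglebot)} other IPs')
for ip, host in googlebot:
    print(ip, host)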
Work With Directories
I use the os module as much as I use the pandas package, and that says a lot.
import os
os.getcwd() # Show working directory
os.listdir() # Show content of a directory
os.chdir('/New/path') # Change Working Directory
os.mkdir('/path') # Create new directory
os.path.exists('/path') # Check if path exists. Avoid creating a duplicate folder
os.path.isfile('path/file.txt') # Check if file exists. Avoid creating a duplicate file
def create_project(directory):
    if not os.path.exists(directory):
        print('Create project: ' + directory)
        os.makedirs(directory)
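For example, with a hypothetical project folder name:
create_project('my-seo-project')  # hypothetical folder; created only if it does not already exist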
Replace Special Characters
# Escape Special Characters as HTML entities
special_char = {'&':'&amp;', "'":'&#39;', '"':'&quot;', '>':'&gt;', '<':'&lt;'}
df['col'] = df['col'].replace(special_char, regex=True)
Replace Part of a String in a DataFrame Using a List
import pandas as pd
# Create DF
words = ['bad_str','good_str']
var = ['blabla_','_blabla']
lst = []
for word in words:
    for i in range(5):
        name = var[0] + word + str(i) + var[1]
        lst.append(name)
df = pd.DataFrame(lst, columns=['Column'])
# Set Strings to Remove
bad_strs = ['bad_str0','bad_str1','bad_str2','bad_str3','bad_str4']
# Create Regex
regex = r'|'.join(bad_strs)
# Replace from DF
df['Column'] = df['Column'].str.replace(regex, '', regex=True)
Write to a JSON file
import json
def write_to_file(data):
    with open('data.json', 'w') as json_file:
        json.dump(data, json_file, indent=2)
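For example, with a made-up dictionary:
write_to_file({'url': 'https://www.example.com', 'status_code': 200})  # writes data.json to the working directory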
Read Text File to a List
with open('filename.txt') as f:
    content = f.read()
ls = content.split('\n')
Download Image from URL
import os
import requests
def get_image(url):
    loc = os.getcwd()
    if url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img = requests.get(url).content
        file_path = os.path.basename(url)
        with open(f'{loc}/{file_path}', 'wb') as f:
            f.write(img)
get_image('https://fr.wiktionary.org/wiki/loriquet_arc-en-ciel#/media/Fichier:Rainbow_lorikeet.jpg')
# Note: this example URL is a media-viewer page, so the saved .jpg will actually contain HTML; pass the direct image file URL to download the image itself.
Convert Rows to a Database Format
Sometimes you extract data from Screaming Frog, but each extracted value is added to a new column. You might want to convert that to a database-like format, with one row per URL and extracted field.
data = pd.read_csv('your-crawl.csv')
db = pd.melt(data, id_vars='Address', value_vars=data.columns[1:], var_name='Fields', value_name='Extraction').dropna()
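To illustrate the reshape, here is a tiny made-up crawl export (the column names H1-1 and Meta 1 are hypothetical):
crawl = pd.DataFrame({'Address': ['/a', '/b'], 'H1-1': ['Title A', 'Title B'], 'Meta 1': ['Desc A', None]})
pd.melt(crawl, id_vars='Address', value_vars=crawl.columns[1:], var_name='Fields', value_name='Extraction').dropna()
#   Address  Fields Extraction
# 0      /a    H1-1    Title A
# 1      /b    H1-1    Title B
# 2      /a  Meta 1     Desc A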
Copy Data Into Clipboard
When you don’t want to export your DataFrame just to copy data. Useful for list crawls in Screaming Frog.
df.to_clipboard(index=False,header=False,sep=',')
#or
import pyperclip
pyperclip.copy('Copy to clipboard')
Apply Pareto
If you want to focus only on the important stuff, it can be useful to apply the Pareto principle to large data sets. All you need is a data set with one column of textual data (like URLs) and one column of numerical data (like sessions or clicks).
num_column = 'clicks'
data = data.sort_values(by=num_column, ascending=False)  # sort descending so the cumulative sum starts with the biggest contributors
data['cum_sum'] = data[num_column].cumsum(skipna=True)
data['cum_perc'] = 100 * data['cum_sum'] / data[num_column].sum()
pareto = data[data.cum_perc <= 80]
pareto
# URL clicks cum_sum cum_perc
# /seo 100 100 0.60
# /python-for-seo 80 180 0.71
# /google-search-console 60 240 0.76
# /technical-seo 40 300 0.79
Find Casing of a List of Keywords
We will use the str "is" methods (isdigit(), istitle(), isupper(), etc.) to find the type of string we have in a list of keywords. You can see all available methods using dir(str).
words = ['34','No Experience','CAT','no exp','no Exp','No exp',' ']
word_dict = dict.fromkeys(words)
meth = ['.isspace()','.isalpha()','.isdigit()','.islower()','.isspace()','.istitle()','.isupper()']
for word in word_dict:
    for i in range(len(meth)):
        # Build and evaluate e.g. '"34".isdigit()'; the last method that returns True is kept
        if eval('"%s"' % word + str(meth[i])):
            word_dict[word] = meth[i]
word_dict
'''
Result
{'34': '.isdigit()',
'No Experience': '.istitle()',
'CAT': '.isupper()',
'no exp': '.islower()',
'no Exp': None,
'No exp': None,
' ': '.isspace()'}
'''
Filter a DataFrame Using Another DataFrame's Column
import pandas as pd
rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
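A small illustration with made-up DataFrames:
df1 = pd.DataFrame({'col': ['/page-1', '/page-2']})
df2 = pd.DataFrame({'col': ['/page-1', '/page-2', '/page-3'], 'clicks': [10, 20, 30]})
rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
# Only '/page-3' remains in df2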
Read Access Log Files
If you want to try this code, just download a fake Access.log file from my Github account.
import pandas as pd
import re
# Get a sample log file from 'https://github.com/jcchouinard/SEO-Projects/blob/master/access_log_20200602-101559.log'
logs = 'access_log_20200602-101559.log'
log_data = []
columns = ['ip', 'date', 'http_request', 'status_code', 'count', 'request_url', 'user_agent']
regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'
with open(logs, 'r') as f:
    # Get the components of each line of the log file into a structured tuple
    for line in f:
        match = re.match(regex, line)
        if match:  # skip lines that do not match the log format
            log_data.append(match.groups())
log_data = pd.DataFrame(log_data, columns=columns)
log_data
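From there, a quick aggregation on the parsed DataFrame shows, for example, the most common status codes and user agents:
print(log_data['status_code'].value_counts())        # most common response codes
print(log_data['user_agent'].value_counts().head(10))  # most frequent user agents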
Make a Request to Reddit API
Let's make a simple request to the Reddit API using requests. If you would like to do more complex things, you can also use Pushshift to query the Reddit API.
This script gets the top post from today on r/python.
import requests
import json
subreddit = 'python'
count = 1
timeframe = 'day' #hour, day, week, month, year, all
listing = 'top' # controversial, best, hot, new, random, rising, top
def get_reddit(subreddit, count):
    try:
        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?count={count}&t={timeframe}'
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.exceptions.RequestException:
        print('An Error Occured')
        return None
    return request.json()
top_post = get_reddit(subreddit,count)
if listing != 'random':
    title = top_post['data']['children'][0]['data']['title']
    url = top_post['data']['children'][0]['data']['url']
else:
    title = top_post[0]['data']['children'][0]['data']['title']
    url = top_post[0]['data']['children'][0]['data']['url']
print(f'{title}\n{url}')
Load Multiple CSV Into One Dataframe
import pandas as pd
import glob
path = '/your/path' # use your path
all_files = glob.glob(path + "/*_gsc_data.csv")
data = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    data.append(df)
merged_df = pd.concat(data, axis=0, ignore_index=True)
Encode Special Characters
from urllib.parse import quote, unquote
quote('/jobs-in-Melbourne,-VIC')
# '/jobs-in-Melbourne%2C-VIC'
unquote('/jobs-in-Melbourne%2C-VIC')
# '/jobs-in-Melbourne,-VIC'
Set Python's Default Version to Python 3 on Mac OS X
If you have installed Python without Anaconda on a Mac, you will need to use the python3 and pip3 commands instead of the usual python and pip commands. This is the case when you type python --version in the Terminal and the result is Python 2.7.16.
In Terminal, type:
vi ~/.bash_profile
It will open the vi editor, where you can press I to enter edit mode. Add the lines below and press esc. Then write and quit the vi editor using :wq.
alias python='python3'
alias pip='pip3'
Then restart the Terminal and the python version should be python3 by default.
$ python --version
Python 3.8.3
Work with Dates in Python
Convert Dates to String and Strings to Date
This is a bit of a housekeeping code that I add whenever I use dates. With these two functions, I can easily convert the dates to string and the strings to datetime.
import datetime
# Convert a datetime object to a string with format YYYY-MM-DD
def date_to_str(date):
    if isinstance(date, datetime.datetime):
        date = datetime.datetime.strftime(date, '%Y-%m-%d')
    return date
# Convert a string with format YYYY-MM-DD to a datetime object
def str_to_date(date):
    if isinstance(date, str):
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
    return date
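Quick usage example:
date_to_str(datetime.datetime(2020, 6, 2))  # '2020-06-02'
str_to_date('2020-06-02')                   # datetime.datetime(2020, 6, 2, 0, 0)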
List Dates Between Two Dates
For this function, you will need the date_to_str() and str_to_date() functions above. The list_dates() function will return a list of dates between the two dates that we choose.
import datetime
def list_dates(startDate, endDate):
    start_date = str_to_date(startDate)
    end_date = str_to_date(endDate)
    delta = end_date - start_date
    days = []
    for i in range(delta.days + 1):
        day = start_date + datetime.timedelta(days=i)
        days.append(date_to_str(day))
    return days
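For example:
list_dates('2020-06-01', '2020-06-04')
# ['2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04']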
Check Last Day of the Month
import datetime
def last_day_of_month(date):
    next_month = str_to_date(date).replace(day=28) + datetime.timedelta(days=4)  # this will never fail
    return next_month - datetime.timedelta(days=next_month.day)
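For example, February 2020 (a leap year):
last_day_of_month('2020-02-15')
# datetime.datetime(2020, 2, 29, 0, 0)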
Check Time to Run a Function
import time
def time_wait():
    time.sleep(3)
start_time = time.time()
time_wait()
end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))
Efficiently get Status Codes from List of URLs
pip install requests-futures
import pandas as pd
from requests_futures.sessions import FuturesSession
def get_request(url):
    session = FuturesSession(max_workers=5)
    return session.head(url)
def get_status_code(r):
    return r.result().status_code
if __name__ == "__main__":
    urls = ['url1', 'url2', 'url3']
    df = pd.DataFrame({"url": urls})
    df["status_code"] = df["url"].apply(get_request).apply(get_status_code)
Read Massive CSV Files with Pandas Chunk
import pandas as pd
filename = 'data.csv'
def chunk_read_csv(filename, size):
    chunks = pd.read_csv(filename, chunksize=size)
    lsChunk = []  # Initialize a list to store the chunks
    for chunk in chunks:
        print(chunk.index)  # Print the RangeIndex
        lsChunk.append(chunk)  # Append the chunk to the list
    print('Chunk Parsing: Done')
    # Concat the chunk list into a dataframe
    df = pd.concat(lsChunk)
    print('DF is Ready')
    return df
chunk_read_csv(filename, 1000000)