Here is a collection of useful Python functions that SEOs can use for everyday tasks such as making HTTP requests, checking robots.txt, parsing logs, and manipulating data with pandas.
Generate Fake User-Agent
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
ua = UserAgent()
# ua.chrome returns a Chrome user-agent; ua.random returns a random one
header = {'User-Agent': str(ua.random)}
url = 'https://www.example.com'
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
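As a quick check, you can print the page title from the parsed soup (assuming the request succeeded and the page has a title tag):
print(soup.title.string)  # e.g. 'Example Domain' for https://www.example.com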
Check Robots.txt and Fetch With BeautifulSoup
# Import Libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from reppy.robots import Robots
# Set URLs
good_url = 'https://ca.indeed.com/Python-jobs'
bad_url = 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws'
# Convert URL to extract robots.txt location
def get_bot_loc(url):
    domain_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    bot_loc = domain_url + '/robots.txt'
    return bot_loc
# Parse the Robots.txt to see if URL is allowed
def robot_parser(url):
    bot_loc = get_bot_loc(url)
    parser = Robots.fetch(bot_loc)
    validation = parser.allowed(url, '*')
    return validation
# Fetch URL and return HTML if URL is allowed in Robots.txt
def fetch(url):
    validation = robot_parser(url)
    if validation:
        try:
            response = requests.get(url)
        except requests.exceptions.ConnectionError:
            print('Error: "%s" is not available!' % url)
            return None
        content = BeautifulSoup(response.text, 'lxml')
    else:
        content = '%s is blocked by robots.txt' % url
    return content
fetch(bad_url)
# 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws is blocked by robots.txt'
fetch(good_url)
# HTML returned
Reverse DNS Lookup for Googlebot
Based on Koray Tuğberk GÜBÜR's post, here is a quick way to run a reverse DNS lookup on a list of IPs to check which ones really belong to Googlebot.
import socket
filename = 'ip_list.txt'
with open(filename, 'r') as f:
    content = f.read()
ls = content.split('\n')
nongooglebot = []
googlebot = []
for row in ls:
    try:
        # Reverse DNS lookup: IP -> hostname
        reversed_dns = socket.gethostbyaddr(row)
        if 'googlebot.com' in reversed_dns[0] or 'google.com' in reversed_dns[0]:
            # Forward-confirm: the returned hostname must resolve back to the same IP
            forward_ip = socket.gethostbyname(reversed_dns[0])
            if forward_ip == row:
                googlebot.append((row, reversed_dns[0]))
            else:
                nongooglebot.append((row, reversed_dns[0]))
        else:
            nongooglebot.append((row, reversed_dns[0]))
    except socket.error:
        # Skip rows that cannot be resolved (e.g. empty lines)
        pass
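Once the loop has run, a quick summary of the two lists:
print(f'{len(googlebot)} verified Googlebot IPs')
print(f'{len(nongooglebot)} other IPs')
for ip, host in googlebot:
    print(ip, host)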
Work With Directories
I use the os module as much as I use the pandas package, and that says a lot.
import os
os.getcwd() # Show working directory
os.listdir() # Show content of a directory
os.chdir('/New/path') # Change Working Directory
os.mkdir('/path') # Create new directory
os.path.exists('/path') # Check if path exists. Avoid creating a duplicate folder
os.path.isfile('path/file.txt') # Check if file exists. Avoid creating a duplicate file
def create_project(directory):
    if not os.path.exists(directory):
        print('Create project: ' + directory)
        os.makedirs(directory)
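For example, with a hypothetical project folder name:
create_project('my-seo-project')  # hypothetical folder; created only if it does not already exist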
Replace Special Characters
# Escape Special Characters as HTML entities
special_char = {'&':'&amp;', "'":'&#39;', '"':'&quot;', '>':'&gt;', '<':'&lt;'}
df['col'] = df['col'].replace(special_char, regex=True)
Replace Part of a String in a DataFrame Using a List
import pandas as pd
# Create DF
words = ['bad_str','good_str']
var = ['blabla_','_blabla']
lst = []
for word in words:
    for i in range(5):
        name = var[0] + word + str(i) + var[1]
        lst.append(name)
df = pd.DataFrame(lst, columns=['Column'])
# Set Strings to Remove
bad_strs = ['bad_str0','bad_str1','bad_str2','bad_str3','bad_str4']
# Create Regex
regex = r'|'.join(bad_strs)
# Replace from DF
df['Column'] = df['Column'].str.replace(regex, '', regex=True)
Write to a JSON file
import json
def write_to_file(data):
    with open('data.json', 'w') as json_file:
        json.dump(data, json_file, indent=2)
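For example, with a made-up dictionary:
write_to_file({'url': 'https://www.example.com', 'status_code': 200})  # writes data.json to the working directory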
Read Text File to a List
with open('filename.txt') as f:
    content = f.read()
ls = content.split('\n')
Download Image from URL
import os
import requests
def get_image(url):
    loc = os.getcwd()
    if url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img = requests.get(url).content
        file_path = os.path.basename(url)
        with open(f'{loc}/{file_path}', 'wb') as f:
            f.write(img)
get_image('https://fr.wiktionary.org/wiki/loriquet_arc-en-ciel#/media/Fichier:Rainbow_lorikeet.jpg')
# Note: this example URL is a media-viewer page, so the saved .jpg will actually contain HTML; pass the direct image file URL to download the image itself.
Convert Rows to a Database Format
Sometimes you extract data from Screaming Frog, but each extracted value is added to a new column. You might want to convert that to a database-like format, with one row per URL and extracted field.
data = pd.read_csv('your-crawl.csv')
db = pd.melt(data, id_vars='Address', value_vars=data.columns[1:], var_name='Fields', value_name='Extraction').dropna()
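To illustrate the reshape, here is a tiny made-up crawl export (the column names H1-1 and Meta 1 are hypothetical):
crawl = pd.DataFrame({'Address': ['/a', '/b'], 'H1-1': ['Title A', 'Title B'], 'Meta 1': ['Desc A', None]})
pd.melt(crawl, id_vars='Address', value_vars=crawl.columns[1:], var_name='Fields', value_name='Extraction').dropna()
#   Address  Fields Extraction
# 0      /a    H1-1    Title A
# 1      /b    H1-1    Title B
# 2      /a  Meta 1     Desc A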
Copy Data Into Clipboard
When you don’t want to export your DataFrame just to copy data. Useful for list crawls in Screaming Frog.
df.to_clipboard(index=False,header=False,sep=',')
#or
import pyperclip
pyperclip.copy('Copy to clipboard')
Apply Pareto
If you want to focus only on the important stuff, it can be useful to apply the Pareto principle to large data sets. All you need is a data set with one column of textual data (like URLs) and one column of numerical data (like sessions or clicks).
num_column = 'clicks'
data = data.sort_values(by=num_column, ascending=False)  # sort descending so the cumulative sum starts with the biggest contributors
data['cum_sum'] = data[num_column].cumsum(skipna=True)
data['cum_perc'] = 100 * data['cum_sum'] / data[num_column].sum()
pareto = data[data.cum_perc <= 80]
pareto
# URL clicks cum_sum cum_perc
# /seo 100 100 0.60
# /python-for-seo 80 180 0.71
# /google-search-console 60 240 0.76
# /technical-seo 40 300 0.79
Find Casing of a List of Keywords
We will use the str "is" methods (isdigit(), istitle(), isupper(), etc.) to find the type of string we have in a list of keywords. You can see all available methods using dir(str).
words = ['34','No Experience','CAT','no exp','no Exp','No exp',' ']
word_dict = dict.fromkeys(words)
meth = ['.isspace()','.isalpha()','.isdigit()','.islower()','.isspace()','.istitle()','.isupper()']
for word in word_dict:
    for i in range(len(meth)):
        # Build and evaluate e.g. '"34".isdigit()'; the last method that returns True is kept
        if eval('"%s"' % word + str(meth[i])):
            word_dict[word] = meth[i]
word_dict
'''
Result
{'34': '.isdigit()',
'No Experience': '.istitle()',
'CAT': '.isupper()',
'no exp': '.islower()',
'no Exp': None,
'No exp': None,
' ': '.isspace()'}
'''
Filter a DataFrame Using Another DataFrame's Column
import pandas as pd
rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
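A small illustration with made-up DataFrames:
df1 = pd.DataFrame({'col': ['/page-1', '/page-2']})
df2 = pd.DataFrame({'col': ['/page-1', '/page-2', '/page-3'], 'clicks': [10, 20, 30]})
rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
# Only '/page-3' remains in df2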
Read Access Log Files
If you want to try this code, just download a fake Access.log file from my Github account.
import pandas as pd
import re
# Get a sample log file from 'https://github.com/jcchouinard/SEO-Projects/blob/master/access_log_20200602-101559.log'
logs = 'access_log_20200602-101559.log'
log_data = []
columns = ['ip', 'date', 'http_request', 'status_code', 'count', 'request_url', 'user_agent']
regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'
with open(logs, 'r') as f:
    # Get the components of each line of the log file into a structured tuple
    for line in f:
        match = re.match(regex, line)
        if match:  # skip lines that do not match the log format
            log_data.append(match.groups())
log_data = pd.DataFrame(log_data, columns=columns)
log_data
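From there, a quick aggregation on the parsed DataFrame shows, for example, the most common status codes and user agents:
print(log_data['status_code'].value_counts())        # most common response codes
print(log_data['user_agent'].value_counts().head(10))  # most frequent user agents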
Make a Request to Reddit API
Let's make a simple request to the Reddit API using requests. If you would like to do more complex things, you can also use Pushshift to query the Reddit API.
This script gets the top post from today on r/python.
import requests
import json
subreddit = 'python'
count = 1
timeframe = 'day' #hour, day, week, month, year, all
listing = 'top' # controversial, best, hot, new, random, rising, top
def get_reddit(subreddit, count):
    try:
        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?count={count}&t={timeframe}'
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.exceptions.RequestException:
        print('An Error Occured')
        return None
    return request.json()
top_post = get_reddit(subreddit,count)
if listing != 'random':
    title = top_post['data']['children'][0]['data']['title']
    url = top_post['data']['children'][0]['data']['url']
else:
    title = top_post[0]['data']['children'][0]['data']['title']
    url = top_post[0]['data']['children'][0]['data']['url']
print(f'{title}\n{url}')
Load Multiple CSV Into One Dataframe
import pandas as pd
import glob
path = '/your/path' # use your path
all_files = glob.glob(path + "/*_gsc_data.csv")
data = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    data.append(df)
merged_df = pd.concat(data, axis=0, ignore_index=True)
Encode Special Characters
from urllib.parse import quote, unquote
quote('/jobs-in-Melbourne,-VIC')
# '/jobs-in-Melbourne%2C-VIC'
unquote('/jobs-in-Melbourne%2C-VIC')
# '/jobs-in-Melbourne,-VIC'
Set Python's Default Version to Python 3 on Mac OS X
If you have installed Python without Anaconda on a Mac, you will need to use the python3 and pip3 commands instead of the usual python and pip commands. This is the case when you type python --version in the Terminal and the result is Python 2.7.16.
In Terminal, type:
vi ~/.bash_profile
It will open the vi editor, where you can press I to enter edit mode. Add the lines below and press esc. Then write and quit the vi editor using :wq.
alias python='python3'
alias pip='pip3'
Then restart the Terminal and the python version should be python3 by default.
$ python --version
Python 3.8.3
Work with Dates in Python
Convert Dates to String and Strings to Date
This is a bit of a housekeeping code that I add whenever I use dates. With these two functions, I can easily convert the dates to string and the strings to datetime.
import datetime
# Convert a datetime object to a string with format YYYY-MM-DD
def date_to_str(date):
    if isinstance(date, datetime.datetime):
        date = datetime.datetime.strftime(date, '%Y-%m-%d')
    return date
# Convert a string with format YYYY-MM-DD to a datetime object
def str_to_date(date):
    if isinstance(date, str):
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
    return date
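Quick usage example:
date_to_str(datetime.datetime(2020, 6, 2))  # '2020-06-02'
str_to_date('2020-06-02')                   # datetime.datetime(2020, 6, 2, 0, 0)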
List Dates Between Two Dates
For this function, you will need the date_to_str() and str_to_date() functions above. The list_dates() function will return a list of dates between the two dates that we choose.
import datetime
def list_dates(startDate, endDate):
    start_date = str_to_date(startDate)
    end_date = str_to_date(endDate)
    delta = end_date - start_date
    days = []
    for i in range(delta.days + 1):
        day = start_date + datetime.timedelta(days=i)
        days.append(date_to_str(day))
    return days
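For example:
list_dates('2020-06-01', '2020-06-04')
# ['2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04']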
Check Last Day of the Month
import datetime
def last_day_of_month(date):
    next_month = str_to_date(date).replace(day=28) + datetime.timedelta(days=4)  # this will never fail
    return next_month - datetime.timedelta(days=next_month.day)
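For example, February 2020 (a leap year):
last_day_of_month('2020-02-15')
# datetime.datetime(2020, 2, 29, 0, 0)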
Check Time to Run a Function
import time
def time_wait():
    time.sleep(3)
start_time = time.time()
time_wait()
end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))
Efficiently get Status Codes from List of URLs
pip install requests-futures
import pandas as pd
from requests_futures.sessions import FuturesSession
def get_request(url):
    session = FuturesSession(max_workers=5)
    return session.head(url)
def get_status_code(r):
    return r.result().status_code
if __name__ == "__main__":
    urls = ['url1', 'url2', 'url3']
    df = pd.DataFrame({"url": urls})
    df["status_code"] = df["url"].apply(get_request).apply(get_status_code)
Read Massive CSV Files with Pandas Chunk
import pandas as pd
filename = 'data.csv'
def chunk_read_csv(filename, size):
    chunks = pd.read_csv(filename, chunksize=size)
    lsChunk = []  # Initialize a list to store the chunks
    for chunk in chunks:
        print(chunk.index)  # Print the RangeIndex
        lsChunk.append(chunk)  # Append the chunk to the list
    print('Chunk Parsing: Done')
    # Concat the chunk list into a dataframe
    df = pd.concat(lsChunk)
    print('DF is Ready')
    return df
chunk_read_csv(filename, 1000000)