Useful Python SEO Functions

This post is part of the complete Guide on Python for SEO

Here is a collection of useful Python functions that SEOs can reuse for common tasks such as making HTTP requests, checking robots.txt, or parsing log files.

Generate Fake User-Agent
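
The fake_useragent library is not part of the standard library; if needed, install it from PyPI (the package name is fake-useragent):

pip install fake-useragent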

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Generate a random user-agent string
ua = UserAgent()
# ua.chrome  # or pick a specific browser

header = {'User-Agent': str(ua.random)}
url = 'https://www.example.com'

response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')

Check Robots.txt and Fetch With BeautifulSoup

# Import Libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from reppy.robots import Robots

# Set URLs
good_url = 'https://ca.indeed.com/Python-jobs'
bad_url = 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws'

# Convert URL to extract robots.txt location
def get_bot_loc(url):
    domain_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    bot_loc = domain_url + '/robots.txt'
    return bot_loc

# Parse the Robots.txt to see if URL is allowed
def robot_parser(url):
    bot_loc = get_bot_loc(url)
    parser = Robots.fetch(bot_loc)
    validation = parser.allowed(url, '*')
    return validation

# Fetch URL and return parsed HTML if URL is allowed in robots.txt
def fetch(url):
    validation = robot_parser(url)
    if validation:
        try:
            response = requests.get(url)
        except requests.exceptions.ConnectionError:
            return 'Error: "%s" is not available!' % url
        content = BeautifulSoup(response.text, 'lxml')
    else:
        content = '%s is blocked by robots.txt' % url
    return content


fetch(bad_url)
# 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws is blocked by robots.txt'

fetch(good_url)
# HTML returned

Reverse DNS Lookup for Googlebot

Based on Koray Tuğberk GÜBÜR's post, here is a quick way to run a reverse DNS lookup on a list of IPs and verify which ones really belong to Googlebot.

import socket

filename = 'ip_list.txt'
with open(filename, 'r') as f:
    content = f.read()
    ls = content.split('\n')

nongooglebot = []
googlebot = []

for row in ls:
    try:
        # Reverse DNS lookup returns (hostname, aliases, ip_addresses)
        reversed_dns = socket.gethostbyaddr(row)
        hostname = reversed_dns[0]
        if 'googlebot.com' in hostname or 'google.com' in hostname:
            # Double-check: resolve the returned IP again and make sure
            # it matches the original record
            temp_ip = socket.gethostbyaddr(reversed_dns[2][0])
            if reversed_dns == temp_ip:
                googlebot.append((row, hostname))
            else:
                nongooglebot.append((row, hostname))
        else:
            nongooglebot.append((row, hostname))
    except socket.error:
        # Skip IPs that cannot be resolved
        pass
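
As a quick follow-up (a minimal sketch; the output file name is just an assumption), you can review the counts and store the verified Googlebot IPs:

# Review the results and save the verified Googlebot IPs
print('Verified Googlebot IPs: %s' % len(googlebot))
print('Other IPs: %s' % len(nongooglebot))

with open('verified_googlebot_ips.txt', 'w') as out:
    for ip, hostname in googlebot:
        out.write('%s\t%s\n' % (ip, hostname))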

Work With Directories

I use the os module as much as I use the pandas package, and that says a lot.


    import os
    
    os.getcwd() # Show working directory
    os.listdir() # Show content of a directory
    os.chdir('/New/path') # Change Working Directory
    os.mkdir('/path') # Create new directory 
    os.path.exists('/path') # Check if path exists. Avoid creating a duplicate folder
    os.path.isfile('path/file.txt') # Check if file exists. Avoid creating a duplicate file
    
    def create_project(directory):
        if not os.path.exists(directory):
            print('Create project: '+ directory)
            os.makedirs(directory)
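
    For example, with a placeholder folder name:

    create_project('my-seo-project')
    # Create project: my-seo-project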
    

    Replace Special Characters

    # Escape special characters into HTML entities
    special_char = {'&':'&amp;', "'":'&apos;', '"':'&quot;', '>':'&gt;', '<':'&lt;'}
    df['col'] = df['col'].replace(special_char, regex=True)
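
    A minimal illustration, assuming a made-up DataFrame:

    import pandas as pd
    
    df = pd.DataFrame({'col': ['Fish & Chips', 'a < b']})
    df['col'] = df['col'].replace(special_char, regex=True)
    # 0    Fish &amp; Chips
    # 1    a &lt; b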
    

    Replace Part of String From Dataframe Using a List

    import pandas as pd
    
    # Create DF
    words = ['bad_str','good_str']
    var = ['blabla_','_blabla']
    lst = []
    for word in words:
        for i in range(5):
            name = var[0]+word+str(i)+var[1]
            lst.append(name)
    
    df = pd.DataFrame(lst, columns=['Column'])
    
    # Set Strings to Remove
    bad_strs = ['bad_str0','bad_str1','bad_str2','bad_str3','bad_str4']
    
    # Create Regex
    regex = r'|'.join(bad_strs)
    
    # Replace from DF
    df['Column'].str.replace(regex,'', regex=True)
    
    

    Write to a JSON file

    import json
    
    def write_to_file(data):
        with open('data.json', 'w') as json_file:
            json.dump(data, json_file, indent=2)
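
    For example, with a hypothetical dictionary:

    data = {'url': 'https://www.example.com', 'status_code': 200}
    write_to_file(data)  # Creates data.json in the working directory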
    

    Read Text File to a List

    with open('filename.txt') as f:
        content = f.read()
        ls = content.split('\n')
    

    Download Image from URL

    import os
    import requests
    
    def get_image(url):
        loc = os.getcwd()
        if url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
            img = requests.get(url).content
            file_path = os.path.basename(url)
            with open(f'{loc}/{file_path}', 'wb') as f:
                f.write(img)
    
    get_image('https://fr.wiktionary.org/wiki/loriquet_arc-en-ciel#/media/Fichier:Rainbow_lorikeet.jpg')
    

    Convert Rows to a Database Format

    Sometimes you extract data from Screaming Frog, but each extracted value is added to a new column. You might want to convert that to a database format.

    import pandas as pd
    
    data = pd.read_csv('your-crawl.csv')
    db = pd.melt(data, id_vars='Address', value_vars=data.columns[1:], var_name='Fields', value_name='Extraction').dropna()
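
    With hypothetical extraction columns, each field/value pair from the wide crawl export becomes its own row:

    db.head()
    # Columns: Address | Fields (the original column name) | Extraction (the extracted value)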
    

    Copy Data Into Clipboard

    Use this when you don’t want to export your DataFrame just to copy some data. It is useful for list crawls in Screaming Frog.

    df.to_clipboard(index=False,header=False,sep=',')
    
    #or
    
    import pyperclip
    pyperclip.copy('Copy to clipboard')
    

    Apply Pareto

    If you want to focus only on what matters, it can be useful to apply the Pareto principle to large data sets. All you need is a data set with one column for textual data (like URLs) and one column for numerical data (like sessions or clicks). The snippet sorts the data by the numerical column in descending order, then keeps the rows that make up the first 80% of the cumulative total.

    num_column = 'clicks'
    data = data.sort_values(num_column, ascending=False)
    data['cum_sum'] = data[num_column].cumsum(skipna=True)
    data['cum_perc'] = 100 * data['cum_sum'] / data[num_column].sum()
    pareto = data[data.cum_perc <= 80]
    pareto
    
    # URL                       clicks    cum_sum   cum_perc 
    # /seo                         100        100       0.60  
    # /python-for-seo               80        180       0.71  
    # /google-search-console        60        240       0.76  
    # /technical-seo                40        300       0.79  
    

    Find Casing of a List of Keywords

    We will use the str is...() methods (like isdigit() or istitle()) to find what type of casing each keyword in a list has. You can see all the available methods using dir(str).

    words = ['34','No Experience','CAT','no exp','no Exp','No exp',' ']
    word_dict = dict.fromkeys(words)
    meth = ['.isalpha()','.isdigit()','.islower()','.isspace()','.istitle()','.isupper()']
    for word in word_dict:
        for i in range(len(meth)):
            if eval('"%s"' % word + str(meth[i])):
                word_dict[word] = meth[i]
    
    word_dict
    
    '''
    Result
    {'34': '.isdigit()',
     'No Experience': '.istitle()',
     'CAT': '.isupper()',
     'no exp': '.islower()',
     'no Exp': None,
     'No exp': None,
     ' ': '.isspace()'}
    '''
    

    Filter Dataframe Using Another Dataframe Columns

    import pandas as pd
    
    rows_to_rm = df1['col'].tolist()
    df2 = df2[~df2['col'].isin(rows_to_rm)]
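
    A minimal illustration with made-up data (the URLs are just placeholders):

    df1 = pd.DataFrame({'col': ['/old-page', '/noindex-page']})
    df2 = pd.DataFrame({'col': ['/old-page', '/seo', '/python-for-seo']})
    
    rows_to_rm = df1['col'].tolist()
    df2 = df2[~df2['col'].isin(rows_to_rm)]
    # df2 now only contains '/seo' and '/python-for-seo'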
    

    Read Access Log Files

    If you want to try this code, just download a fake Access.log file from my Github account.

    import re
    import pandas as pd
    
    # Get a sample log file from 'https://github.com/jcchouinard/SEO-Projects/blob/master/access_log_20200602-101559.log'
    
    logs = 'access_log_20200602-101559.log'
    log_data = []
    columns = ['ip','date','http_request','status_code','count','request_url','user_agent']
    regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'
    
    # Extract the components of each line of the log file into a structured list
    with open(logs, 'r') as f:
        for line in f:
            line = re.match(regex, line).groups()
            log_data.append(line)
            
    log_data = pd.DataFrame(log_data, columns=columns)
    
    log_data
    

    Make a Request to Reddit API

    Let’s make a simple request to the Reddit API using requests. If you would like to do more complex things, you can also use Pushshift to query the Reddit API.

    This script gets the top post from today on r/python.

    import requests
    import json 
    
    subreddit = 'python'
    count = 1
    timeframe = 'day' #hour, day, week, month, year, all
    listing = 'top' # controversial, best, hot, new, random, rising, top
    
    def get_reddit(subreddit,count):
        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?count={count}&t={timeframe}'
        try:
            request = requests.get(base_url, headers={'User-agent': 'yourbot'})
            return request.json()
        except requests.exceptions.RequestException:
            print('An error occurred')
    
    top_post = get_reddit(subreddit,count)
    
    if listing != 'random':
        title = top_post['data']['children'][0]['data']['title']
        url = top_post['data']['children'][0]['data']['url']
    else:
        title = top_post[0]['data']['children'][0]['data']['title']
        url = top_post[0]['data']['children'][0]['data']['url']
    
    
    print(f'{title}\n{url}')
    
    

    Load Multiple CSV Into One Dataframe

    import pandas as pd 
    import glob
    
    path = '/your/path' # use your path
    all_files = glob.glob(path + "/*_gsc_data.csv")
    
    data = []
    
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        data.append(df)
    
    merged_df = pd.concat(data, axis=0, ignore_index=True)
    

    Encode Special Characters

    from urllib.parse import quote, unquote
    
    quote('/jobs-in-Melbourne,-VIC')
    # '/jobs-in-Melbourne%2C-VIC'
    
    unquote('/jobs-in-Melbourne%2C-VIC')
    # '/jobs-in-Melbourne,-VIC'
    

    Set Python's Default Version to Python 3 on Mac OS X

    If you have installed Python without Anaconda on Mac, you will need to use the python3 and pip3 commands instead of the usual python and pip commands.

    This is the case if typing python --version in Terminal returns Python 2.7.16.

    In Terminal, type:

    vi ~/.bash_profile
    

    It will open the vi editor, where you can press i to enter insert mode. Add the lines below, then press Esc. Write and quit the vi editor using :wq.

    alias python='python3'
    alias pip='pip3'
    

    Then restart the Terminal and the python command should default to Python 3.

    $ python --version
    
    Python 3.8.3
    

    Work with Dates in Python

    Convert Dates to String and Strings to Date

    This is a bit of a housekeeping code that I add whenever I use dates. With these two functions, I can easily convert the dates to string and the strings to datetime.

    import datetime
    
    # Convert date to a string with format YYYY-MM-DD
    def date_to_str(date):
        if isinstance(date, datetime.datetime):
            date = datetime.datetime.strftime(date,'%Y-%m-%d')
        return date
    
    # Convert a string with format YYYY-MM-DD to a datetime object
    def str_to_date(date):
        if isinstance(date, str):
            date = datetime.datetime.strptime(date,'%Y-%m-%d')
        return date
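
    For example, with arbitrary dates:

    date_to_str(datetime.datetime(2023, 1, 31))  # '2023-01-31'
    str_to_date('2023-01-31')                    # datetime.datetime(2023, 1, 31, 0, 0)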
    

    List Dates Between Two Dates

    For this function, you will need the date_to_str() and str_to_date() functions above.

    The list_dates() function will return a list of dates between the dates that we choose.

    import datetime
    
    def list_dates(startDate,endDate):
        start_date = str_to_date(startDate)
        end_date = str_to_date(endDate)
        delta = end_date - start_date
        days = []
        for i in range(delta.days + 1):
            day = start_date + datetime.timedelta(days=i)
            days.append(date_to_str(day))
        return days
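
    For example:

    list_dates('2023-01-01', '2023-01-03')
    # ['2023-01-01', '2023-01-02', '2023-01-03']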
    

    Check Last Day of the Month

    import datetime
    
    def last_day_of_month(date):
        next_month = str_to_date(date).replace(day=28) + datetime.timedelta(days=4)  # this will never fail
        return next_month - datetime.timedelta(days=next_month.day)
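
    For example:

    last_day_of_month('2023-02-10')
    # datetime.datetime(2023, 2, 28, 0, 0)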
    

    Check Time to Run a Function

    import time
    
    def time_wait():
        time.sleep(3)
    
    start_time = time.time()
    time_wait()
    end_time = time.time()
    print("--- %s seconds ---" % (end_time - start_time))
    

    Efficiently get Status Codes from List of URLs

    pip install requests-futures

    import pandas as pd
    from requests_futures.sessions import FuturesSession
    
    # Reuse a single session so the HEAD requests run concurrently (5 workers)
    session = FuturesSession(max_workers=5)
    
    def get_request(url):
        return session.head(url)
    
    def get_status_code(r):
        return r.result().status_code
    
    if __name__ == "__main__":
        urls = ['url1', 'url2', 'url3']
        df = pd.DataFrame({"url": urls})
        df["status_code"] = df["url"].apply(get_request).apply(get_status_code)
    
    

    Read Massive CSV Files with Pandas Chunk

    import pandas as pd
    
    filename = 'data.csv'
    
    def chunk_read_csv(filename, size):
        chunks = pd.read_csv(filename, chunksize=size)
    
        ls_chunk = []  # Initialize list to store chunks
    
        for chunk in chunks:
            print(chunk.index)      # Print the RangeIndex of each chunk
            ls_chunk.append(chunk)  # Append the chunk to the list
    
        print('Chunk Parsing: Done')
    
        # Concat the chunk list into a single dataframe
        df = pd.concat(ls_chunk)
        print('DF is Ready')
    
        return df
    
    chunk_read_csv(filename, 1000000)
    