This post is part of the complete Guide on Python for SEO
Here is a collection of useful Python functions that SEOs can use.
Generate Fake User-Agent
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup

ua = UserAgent()  # ua.chrome to force a Chrome user-agent
header = {'User-Agent': str(ua.random)}

url = 'https://www.example.com'
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
Check Robots.txt and Fetch With BeautifulSoup
# Import Libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from reppy.robots import Robots

# Set URLs
good_url = 'https://ca.indeed.com/Python-jobs'
bad_url = 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws'

# Convert URL to extract robots.txt location
def get_bot_loc(url):
    domain_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    bot_loc = domain_url + '/robots.txt'
    return bot_loc

# Parse the robots.txt to see if the URL is allowed
def robot_parser(url):
    bot_loc = get_bot_loc(url)
    parser = Robots.fetch(bot_loc)
    validation = parser.allowed(url, '*')
    return validation

# Fetch the URL and return HTML if the URL is allowed in robots.txt
def fetch(url):
    validation = robot_parser(url)
    if validation:
        try:
            response = requests.get(url)
        except requests.exceptions.ConnectionError:
            print('Error: "%s" is not available!' % url)
            return None
        content = BeautifulSoup(response.text, 'lxml')
    else:
        content = '%s is blocked by robots.txt' % url
    return content

fetch(bad_url)
# 'https://ca.indeed.com/jobs?q=Python&from=brws&nc=brws is blocked by robots.txt'
fetch(good_url)
# HTML returned
Work With Directories
I use the os module as much as I use pandas, and that says a lot.
import os

os.getcwd()                      # Show working directory
os.listdir()                     # Show content of a directory
os.chdir('/New/path')            # Change working directory
os.mkdir('/path')                # Create new directory
os.path.exists('/path')          # Check if path exists. Avoid creating a duplicate folder
os.path.isfile('path/file.txt')  # Check if file exists. Avoid creating a duplicate file

def create_project(directory):
    if not os.path.exists(directory):
        print('Create project: ' + directory)
        os.makedirs(directory)
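For example, to use the create_project() helper (the folder path here is just a placeholder):

create_project('/path/to/my-seo-project')
# Create project: /path/to/my-seo-project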
Replace Special Characters
# Escape Special Characters: map them to their HTML entities
special_char = {'&': '&amp;', "'": '&#39;', '"': '&quot;', '>': '&gt;', '<': '&lt;'}
df['col'] = df['col'].replace(special_char, regex=True)
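A minimal before/after sketch on a made-up dataframe (the 'col' column and its values are assumptions for illustration):

import pandas as pd

df = pd.DataFrame({'col': ['Fish & Chips', '"Quoted" title']})

special_char = {'&': '&amp;', "'": '&#39;', '"': '&quot;', '>': '&gt;', '<': '&lt;'}
df['col'] = df['col'].replace(special_char, regex=True)

print(df['col'].tolist())
# ['Fish &amp; Chips', '&quot;Quoted&quot; title']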
Replace Part of String From Dataframe Using a List
import pandas as pd

# Create DF
words = ['bad_str', 'good_str']
var = ['blabla_', '_blabla']
lst = []

for word in words:
    for i in range(5):
        name = var[0] + word + str(i) + var[1]
        lst.append(name)

df = pd.DataFrame(lst, columns=['Column'])

# Set Strings to Remove
bad_strs = ['bad_str0', 'bad_str1', 'bad_str2', 'bad_str3', 'bad_str4']

# Create Regex
regex = r'|'.join(bad_strs)

# Replace from DF
df['Column'].str.replace(regex, '', regex=True)
Write to a JSON file
import json

def write_to_file(data):
    with open('data.json', 'w') as json_file:
        json.dump(data, json_file, indent=2)
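For example, with a small (made-up) dict of crawl results:

data = {'url': 'https://www.example.com', 'status_code': 200}
write_to_file(data)
# Creates data.json in the working directory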
Download Image from URL
import os
import requests

def get_image(url):
    loc = os.getcwd()
    if url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img = requests.get(url).content
        file_path = os.path.basename(url)
        with open(f'{loc}/{file_path}', 'wb') as f:
            f.write(img)

get_image('https://fr.wiktionary.org/wiki/loriquet_arc-en-ciel#/media/Fichier:Rainbow_lorikeet.jpg')
Convert Rows to a Database Format
Sometimes when you extract data with Screaming Frog, each extracted value is added to its own column. You might want to convert that wide layout to a database (long) format.
import pandas as pd

data = pd.read_csv('your-crawl.csv')

db = pd.melt(data,
             id_vars='Address',
             value_vars=data.iloc[:, 1:],
             var_name='Fields',
             value_name='Extraction').dropna()
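Here is a minimal sketch of what the reshape does, using a made-up crawl export (the 'Extraction 1' and 'Extraction 2' column names are assumptions, not actual Screaming Frog headers):

import pandas as pd

# Made-up crawl export: each extracted value in its own column
data = pd.DataFrame({
    'Address': ['/seo', '/python-for-seo'],
    'Extraction 1': ['Title A', 'Title B'],
    'Extraction 2': [None, 'Heading B'],
})

db = pd.melt(data,
             id_vars='Address',
             value_vars=list(data.columns[1:]),
             var_name='Fields',
             value_name='Extraction').dropna()

print(db)
#            Address        Fields Extraction
# 0             /seo  Extraction 1    Title A
# 1  /python-for-seo  Extraction 1    Title B
# 3  /python-for-seo  Extraction 2  Heading B

Each row now holds one Address / Field / Extraction combination, which is much easier to filter and aggregate.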
Copy Data Into Clipboard
Useful when you don’t want to export your DataFrame just to copy the data, for example to paste a list of URLs into Screaming Frog’s list mode.
df.to_clipboard(index=False, header=False, sep=',')

# or
import pyperclip
pyperclip.copy('Copy to clipboard')
Apply Pareto
If you want to focus only on the important stuff, it might be useful to apply the Pareto principle (the 80/20 rule) to large data sets. All you need is a data set with one column of textual data (like URLs) and one column of numerical data (like sessions or clicks).
num_column = 'clicks'

# Sort by the numeric column so the cumulative sum starts with the top pages
data = data.sort_values(by=num_column, ascending=False)

data['cum_sum'] = data[num_column].cumsum(skipna=True)
data['cum_perc'] = 100 * data['cum_sum'] / data[num_column].sum()

pareto = data[data.cum_perc <= 80]
pareto

#                     URL  clicks  cum_sum  cum_perc
#                    /seo     100      100      0.60
#         /python-for-seo      80      180      0.71
#  /google-search-console      60      240      0.76
#          /technical-seo      40      300      0.79
Find Casing of a List of Keywords
We will use the str is* methods (like isdigit() or istitle()) to find the type of string we have in a list of keywords. You can see all available methods using dir(str).
words = ['34', 'No Experience', 'CAT', 'no exp', 'no Exp', 'No exp', ' ']
word_dict = dict.fromkeys(words)

meth = ['.isspace()', '.isalpha()', '.isdigit()', '.islower()', '.istitle()', '.isupper()']

for word in word_dict:
    for i in range(len(meth)):
        if eval('"%s"' % word + meth[i]):
            word_dict[word] = meth[i]

word_dict
'''
Result:
{'34': '.isdigit()',
 'No Experience': '.istitle()',
 'CAT': '.isupper()',
 'no exp': '.islower()',
 'no Exp': None,
 'No exp': None,
 ' ': '.isspace()'}
'''
Filter a Dataframe Using Another Dataframe's Column
import pandas as pd

rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
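A minimal sketch of how this works, assuming two hypothetical dataframes that share a 'col' column:

import pandas as pd

df1 = pd.DataFrame({'col': ['/old-page', '/expired-offer']})
df2 = pd.DataFrame({'col': ['/old-page', '/seo', '/python-for-seo'],
                    'clicks': [10, 100, 80]})

rows_to_rm = df1['col'].tolist()
df2 = df2[~df2['col'].isin(rows_to_rm)]
# Keeps only the /seo and /python-for-seo rows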
Read Access Log Files
If you want to try this code, just download a fake Access.log file from my Github account.
import pandas as pd
import re

# Get Log File Template from here:
# 'https://github.com/jcchouinard/SEO-Projects/blob/master/access_log_20200602-101559.log'
logs = 'access_log_20200602-101559.log'

log_data = []
columns = ['ip', 'date', 'http_request', 'status_code', 'count', 'request_url', 'user_agent']
regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'

with open(logs, 'r') as f:
    # Extract the components of each log line into a tuple
    for line in f:
        line = re.match(regex, line).groups()
        log_data.append(line)

log_data = pd.DataFrame(log_data, columns=columns)
log_data
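Once the log lines are in a DataFrame, you can summarise them like any other pandas object, for example with a quick count per status code:

log_data['status_code'].value_counts()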
Make a Request to the Reddit API
Let’s make a simple request to the Reddit API using requests. If you would like to do more complex things, you can also use Pushshift to query the Reddit API.
This script gets today’s top post from r/python.
import requests

subreddit = 'python'
count = 1
timeframe = 'day'  # hour, day, week, month, year, all
listing = 'top'    # controversial, best, hot, new, random, rising, top

def get_reddit(subreddit, count):
    try:
        base_url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?count={count}&t={timeframe}'
        request = requests.get(base_url, headers={'User-agent': 'yourbot'})
    except requests.exceptions.RequestException:
        print('An Error Occurred')
        return None
    return request.json()

top_post = get_reddit(subreddit, count)

if listing != 'random':
    title = top_post['data']['children'][0]['data']['title']
    url = top_post['data']['children'][0]['data']['url']
else:
    title = top_post[0]['data']['children'][0]['data']['title']
    url = top_post[0]['data']['children'][0]['data']['url']

print(f'{title}\n{url}')
Load Multiple CSV Files Into One Dataframe
import pandas as pd
import glob

path = '/your/path'  # use your path
all_files = glob.glob(path + '/*_gsc_data.csv')

data = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    data.append(df)

merged_df = pd.concat(data, axis=0, ignore_index=True)
Encode Special Characters
from urllib.parse import quote, unquote

quote('/jobs-in-Melbourne,-VIC')
# '/jobs-in-Melbourne%2C-VIC'

unquote('/jobs-in-Melbourne%2C-VIC')
# '/jobs-in-Melbourne,-VIC'
Set Python’s Default Version to Python 3 on Mac OS X
If you have installed Python without Anaconda on Mac, you will need to use the python3 and pip3 commands instead of the usual python and pip commands.
This is the case when you type python --version in the Terminal and the result is Python 2.7.16.
In Terminal, type:
vi ~/.bash_profile
It will open the vi editor, where you can press i to enter insert mode. Add the lines below, then press esc. Write and quit the vi editor using :wq.
alias python='python3'
alias pip='pip3'
Then restart the Terminal and the python version should be python3 by default.
$ python --version
Python 3.8.3
Work with Dates in Python
Convert Dates to String and Strings to Date
This is a bit of housekeeping code that I add whenever I work with dates. With these two functions, I can easily convert dates to strings and strings to datetime objects.
import datetime

# Convert a date to a string with format YYYY-MM-DD
def date_to_str(date):
    if isinstance(date, datetime.datetime):
        date = datetime.datetime.strftime(date, '%Y-%m-%d')
    return date

# Convert a string with format YYYY-MM-DD to a date
def str_to_date(date):
    if isinstance(date, str):
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
    return date
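A quick usage example:

date_to_str(datetime.datetime(2020, 6, 2))
# '2020-06-02'

str_to_date('2020-06-02')
# datetime.datetime(2020, 6, 2, 0, 0)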
List Dates Between Two Dates
For this function, you will need the date_to_str() and str_to_date() functions above. The list_dates() function will return a list of dates between the dates that we choose.
import datetime

def list_dates(startDate, endDate):
    start_date = str_to_date(startDate)
    end_date = str_to_date(endDate)
    delta = end_date - start_date
    days = []
    for i in range(delta.days + 1):
        day = start_date + datetime.timedelta(days=i)
        days.append(date_to_str(day))
    return days
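For example:

list_dates('2020-06-01', '2020-06-04')
# ['2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04']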
Check Last Day of the Month
import datetime

def last_day_of_month(date):
    next_month = str_to_date(date).replace(day=28) + datetime.timedelta(days=4)  # this will never fail
    return next_month - datetime.timedelta(days=next_month.day)
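A quick check, assuming the str_to_date() helper above is defined:

last_day_of_month('2020-02-15')
# datetime.datetime(2020, 2, 29, 0, 0)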
Check Time to Run a Function
import time

def time_wait():
    time.sleep(3)

start_time = time.time()
time_wait()
end_time = time.time()

print("--- %s seconds ---" % (end_time - start_time))
Efficiently Get Status Codes From a List of URLs
pip install requests-futures
import pandas as pd
from requests_futures.sessions import FuturesSession

def get_request(url):
    session = FuturesSession(max_workers=5)
    return session.head(url)

def get_status_code(r):
    return r.result().status_code

if __name__ == "__main__":
    urls = ['url1', 'url2', 'url3']
    df = pd.DataFrame({"url": urls})
    df["status_code"] = df["url"].apply(get_request).apply(get_status_code)
Read Massive CSV Files with Pandas Chunk
import pandas as pd

filename = 'data.csv'

def chunk_read_csv(filename, size):
    chunks = pd.read_csv(filename, chunksize=size)
    lsChunk = []                  # Initialize list to store chunks
    for chunk in chunks:
        print(chunk.index)        # Print the range index
        lsChunk.append(chunk)     # Append the chunk to the list
    print('Chunk Parsing: Done')
    # Concat the chunk list into a dataframe
    df = pd.concat(lsChunk)
    print('DF is Ready')
    return df

chunk_read_csv(filename, 1000000)