Description:
Useful SEO scripts I use often
I have been trying to keep my code more organized, especially the scripts and functions I reuse most often.
Here are some useful Python scripts for common SEO tasks that I refer to frequently.
CTR Curves
Useful for gathering simple Share-of-Traffic information from ranking position.
Data can be found at: Advanced Web Ranking - google-organic-ctr
Most major SEO tool providers offer individualized CTR curves for keywords or keyword sets, but this can still be useful for
analyzing competitors or different SERP features.
def add_ctr(x):
    # Approximate organic CTR (%) by ranking position, based on the
    # Advanced Web Ranking google-organic-ctr data referenced above.
    vals = {'1': 19.5, '2': 13.15, '3': 8.99, '4': 5.74, '5': 4.07,
            '6': 3.05, '7': 2.41, '8': 1.68, '9': 1.44, '10': 0.9,
            '11': 1.35, '12': 1.35, '13': 1.5, '14': 1.32, '15': 1.34,
            '16': 1.18, '17': 1.16, '18': 0.94, '19': 0.82, '20': 0.59}
    return vals[str(x)]
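As a minimal usage sketch, here is how the curve can be applied to keyword data with pandas. The DataFrame and its 'position' and 'search_volume' columns are placeholder examples, not part of the original script.
import pandas as pd

df = pd.DataFrame({'keyword': ['blue widgets', 'red widgets'],
                   'position': [3, 7],
                   'search_volume': [1200, 400]})

# Map ranking position to the estimated CTR (%), then estimate monthly clicks
df['ctr'] = df['position'].apply(add_ctr)
df['est_clicks'] = df['search_volume'] * df['ctr'] / 100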
Random User-Agent Strings
Helpful for crawling pages that don't like the default Python Requests library User-Agent.
import random

def getUA():
    # Return a random desktop browser User-Agent string
    uastrings = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    ]
    return random.choice(uastrings)
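A minimal usage sketch pairing the random User-Agent with the requests library (the URL here is just a placeholder):
import requests

url = 'https://www.example.com/'
# Send the request with a randomly chosen browser User-Agent
response = requests.get(url, headers={'User-Agent': getUA()}, timeout=5)
print(response.status_code)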
Pull Title and Description from URL
Simple example of using the requests and BeautifulSoup libraries to extract
on-page or meta data (e.g. <meta name="description" content="..." />) from live URLs.
from bs4 import BeautifulSoup
import requests

# Use requests + BeautifulSoup to fetch a page's title and meta description
def fetch_meta(url):
    # Send a browser-like User-Agent; some servers block the default requests UA
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'}
    result = {'title': '', 'description': '', 'error': ''}
    try:
        # Send a GET request to grab the page
        response = requests.get(url, headers=headers, timeout=3)
        # Parse the response
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the title (guard against pages with no <title>)
        if soup.title and soup.title.string:
            result['title'] = soup.title.string.strip()
        # Extract the meta description, if present
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag is not None:
            result['description'] = description_tag.get('content')
    except Exception as e:
        result['error'] = str(e)
    return result
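A minimal usage sketch that runs fetch_meta over a list of pages and collects the results into a pandas DataFrame (the URLs are placeholders):
import pandas as pd

urls = ['https://www.example.com/', 'https://www.example.com/about/']
# Call fetch_meta for each URL and keep the URL alongside its results
rows = [{'url': u, **fetch_meta(u)} for u in urls]
df = pd.DataFrame(rows)
print(df[['url', 'title', 'description', 'error']])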
Crawl a Website by Specifying a Sitemap URL
This spider crawls the pages listed in one or more XML sitemaps and extracts standard SEO elements (title, h1, h2, page size, etc.).
The output of the crawl can be saved to a CSV file (see the run sketch after the code).
from scrapy.spiders import SitemapSpider

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

class SEOSitemapSpider(SitemapSpider):
    name = 'seo_sitemap_spider'
    sitemap_urls = [
        'https://www.example.com/sitemap-index.xml',  # regular sitemaps or sitemap index files both work
        'https://www.example.com/sitemap_1.xml',
        'https://www.example.com/sitemap_2.xml',
    ]
    custom_settings = {
        'USER_AGENT': user_agent,
        'DOWNLOAD_DELAY': 0,      # raise this to 3 or 4 seconds to avoid getting blocked
        'ROBOTSTXT_OBEY': True,   # to obey or not to obey...
        'HTTPERROR_ALLOW_ALL': True,
    }

    def parse(self, response):
        yield dict(
            url=response.url,
            title='@@'.join(response.css('title::text').getall()),
            meta_desc=response.xpath("//meta[@name='description']/@content").get(),
            h1='@@'.join(response.css('h1::text').getall()),
            h2='@@'.join(response.css('h2::text').getall()),
            h3='@@'.join(response.css('h3::text').getall()),
            body_text='\n'.join(response.css('p::text').getall()),
            size=len(response.body),
            load_time=response.meta['download_latency'],
            status=response.status,
            links_href='@@'.join([link.attrib.get('href') or '' for link in response.css('a')]),
            links_text='@@'.join([link.attrib.get('title') or '' for link in response.css('a')]),  # note: pulls each link's title attribute
            img_src='@@'.join([im.attrib.get('src') or '' for im in response.css('img')]),
            img_alt='@@'.join([im.attrib.get('alt') or '' for im in response.css('img')]),
            page_depth=response.meta['depth'],
        )
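To get the CSV output, a minimal run sketch using Scrapy's CrawlerProcess and the FEEDS setting (the output file name is a placeholder; alternatively, run the file with scrapy runspider and the -o flag):
from scrapy.crawler import CrawlerProcess

# Export every yielded item to a CSV feed
process = CrawlerProcess(settings={
    'FEEDS': {'crawl_output.csv': {'format': 'csv'}},
})
process.crawl(SEOSitemapSpider)
process.start()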
To see more examples and useful SEO scripts, check out some of my other posts by searching for "Code" in the post title.