The code does seem a bit repetitive in places, such as the parenturlscraper and childurlscraper functions.
Does anyone have any recommendations for improving my code and condensing it a little?
In essence, the code scrapes this site and populates a table with details about each crash, geocoding the location data extracted from the site using Google.
__version__ = '0.1'
__author__ = 'antmancoder'
# Importing of modules required for the script to run successfully
import scraperwiki
import lxml.html
import urlparse
import urllib2
import dateutil.parser
from geopy import geocoders
# Introduction of various global variables required throughout the running of the code
urlstem = "http://planecrashinfo.com"  # site root; relative links are resolved against this
urlyeardb = "database.htm"  # index page listing one link per year of accidents
yearsource = urlparse.urljoin(urlstem, urlyeardb)  # absolute URL of the year index
yearlist = []  # populated by parenturlscraper(): one absolute URL per year page
sourcepageurl = []  # populated by childurlscraper(): one absolute URL per accident page
def parenturlscraper():
    """Collect the per-year ("parent") URLs from the site's database index page.

    Fetches the year index, resolves every table-cell link to an absolute
    URL, and appends each one to the global yearlist.
    """
    page = scraperwiki.scrape(yearsource)
    document = lxml.html.fromstring(page)
    yearlist.extend(
        urlparse.urljoin(urlstem, anchor.attrib['href'])
        for anchor in document.cssselect('td a')
    )
def childurlscraper():
    """Collect the per-accident ("child") URLs from every year page in yearlist.

    Each relative link found on a year page is resolved to an absolute URL
    and appended to the global sourcepageurl list.
    """
    for year_url in yearlist:
        html = scraperwiki.scrape(year_url)
        root = lxml.html.fromstring(html)
        for anchor in root.cssselect('td a'):
            # urljoin resolves the link against the year page itself, which
            # replaces the fragile hard-coded url[0:34] slice the original
            # used. A distinct name also fixes the original's shadowing bug,
            # where reassigning `url` inside the loop made every subsequent
            # join resolve against the previous child URL instead of the page.
            page_url = urlparse.urljoin(year_url, anchor.attrib['href'])
            sourcepageurl.append(page_url)
def sourcepagescraper():
"""Function scrapes respective data for each accident and placed it into DB"""
for url in sourcepageurl:
try:
html = scraperwiki.scrape(url)
root = lxml.html.fromstring(html)
for tr in root.cssselect("body"):
tds = tr.cssselect("td")
location = coorlookup(tds[7].text_content())
for td in tds:
crashinfo = {}
crashinfo['url'] = url
crashinfo['date'] = dateutil.parser.parse(tds[3].text_content()).date()
crashinfo['time'] = tds[5].text_content()
crashinfo['location'] = tds[7].text_content()
crashinfo['latitude'] = location[1][0]
crashinfo['longitude'] = location[1][1]
crashinfo['operator'] = tds[9].text_content()
crashinfo['flight no'] = tds[11].text_content()
crashinfo['route'] = tds[13].text_content()
crashinfo['aircraft type'] = tds[15].text_content()
crashinfo['registration'] = tds[17].text_content()
crashinfo['cn ln'] = tds[19].text_content()
crashinfo['aboard'] = tds[21].text_content()
crashinfo['fatalities'] = tds[23].text_content()
crashinfo['ground'] = tds[25].text_content()
crashinfo['summary'] = tds[27].text_content()
scraperwiki.sqlite.save(unique_keys=['url'], data=crashinfo)
except urllib2.HTTPError, err:
if err.code == 404:
continue
def coorlookup(location):
    """Geocode a free-text crash location string via Google.

    Called from sourcepagescraper() for each accident. Returns geopy's
    result tuple (place_name, (latitude, longitude)); on any geocoding
    failure returns ("", ("", "")) so callers can index the result safely.
    """
    geocoder = geocoders.Google()
    try:
        return geocoder.geocode(location, exactly_one=True)
    except Exception:
        # Narrowed from the original bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the scrape un-interruptible.
        # geopy raises several exception types (network errors, ambiguous
        # or missing results), so Exception is the practical catch-all here.
        return ("", ("", ""))
# Run the three scraping stages in order:
# year index -> per-year pages -> per-accident pages saved to the DB.
# Guarded so importing this module for reuse does not kick off a full scrape;
# direct execution (as on ScraperWiki) behaves exactly as before.
if __name__ == '__main__':
    parenturlscraper()
    childurlscraper()
    sourcepagescraper()