I wrote a crawler that collects the status code of every page it visits. Below is my solution. Can this code be optimized?
import urllib

def getfromurl(url):
    # Download a page and return its body as a single string.
    start = urllib.urlopen(url)
    raw = ''
    for lines in start.readlines():
        raw += lines
    start.close()
    return raw

def dumbwork(start_link, start_url, text, pattern, counter):
    # Scan `text` for links matching `pattern`, log each URL with its HTTP
    # status code, and recurse into each page at most two levels deep.
    if counter < 2:
        counter = counter + 1
        while start_link != -1:
            try:
                start_url = text.find('/', start_link)
                end_url = text.find('"', start_url + 1)
                url = 'http:/' + text[start_url + 1 : end_url]
                page_status = str(urllib.urlopen(url).getcode())
                row = url + ', ' + page_status
                t.write(row + '\n')
                temp = str(getfromurl(url))
                print row
                dumbwork(temp.find(pattern), 0, temp, pattern, counter)
                start_link = text.find(pattern, end_url + 1)
            except Exception, e:
                break
    else:
        pass

t = open('inout.txt', 'w')
text = str(getfromurl('http://www.site.it'))
pattern = '<a href="http:/'
start_link = text.find(pattern)
dumbwork(start_link, 0, text, pattern, 0)
t.close()
Doesn't `return requests.head(url).status_code` from the requests module do this for you? I usually use this module as it's straightforward and you have a lot fewer headaches if you use it over urllib.
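A minimal sketch of that suggestion, assuming the third-party `requests` package is installed; the helper name `check_status` is illustrative and not part of the original code:

import requests

def check_status(url):
    # A HEAD request asks only for the response headers, so no page body is downloaded.
    return requests.head(url, allow_redirects=True).status_code

print(check_status('http://www.site.it'))  # e.g. 200

`allow_redirects=True` makes the call follow 301/302 responses like a browser would (requests.head does not follow them by default); drop it if you want the raw status of the original URL.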