Here is a simple script that I am using to ping 50 sites at a time and check whether they are up. If a site is down, the script saves the timestamp and the error in MongoDB.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
''' A simple script to ping multiple sites at a time
and capture the down sites
'''
from urllib.request import urlopen, Request
from threading import Thread, Lock
from time import sleep, time
import datetime
import queue

import pymongo

__author__ = "Aamir khan"
__version__ = "1.1"

_MAX_CONNECTIONS = 50

counter = 0
counter_lock = Lock()  # `counter += 1` is not atomic, so guard it
downsites = []
now = datetime.datetime.utcnow  # time stamp

# do not turn both of these on at the same time
_DEBUG = False
_MONITOR = True
def getcollection(db='websites', colname='website_urls'):
    return pymongo.MongoClient().get_database(db).get_collection(colname)


# to save downsites in db
ds = getcollection(colname="downsites")
# fetch urls from db
if _DEBUG:
    print("Fetching Urls")
    urls = getcollection().find()
    print("%d Urls Fetched" % urls.count())

    print("pulling urls to the queue")
    q = queue.Queue(urls.count())
    for url in urls:
        url = url['url']
        q.put(url)
    print("pulled urls to the queue")
    print("The size of the Queue is %d" % q.qsize())
else:
    urls = getcollection().find()
    q = queue.Queue(urls.count())
    for url in urls:
        url = url['url']
        q.put(url)
del urls
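
# NOTE: the two branches above do identical work; _DEBUG only adds
# progress output (this is the duplication question 2 below refers to).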
def inc_counter():
    global counter
    with counter_lock:
        counter += 1
def monitor():
    total = q.qsize()
    if total > 0:
        while counter < total:
            print("%d Request sent" % counter)
            sleep(1)
        print("Total {}/{} Request Sent".format(counter, total))
        assert counter == total
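
# ping() issues a HEAD request (only the status line is needed, not the
# body) and returns a (status_or_error, url, timestamp) tuple.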
def ping(uri):
    req = Request(uri, method='HEAD', headers={
        "User-Agent": ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0)"
                       " Gecko/20100101 Firefox/51.0")
    })
    try:
        with urlopen(req) as r:
            res = r.getcode(), uri, now()
    except Exception as e:
        res = str(e), uri, now()
    finally:
        if _DEBUG:
            err, uri, last_check = res
            print("Requesting = ", uri, "Request Method = ", req.get_method(),
                  "\nstatus = ", err, "time", last_check)
            print("-----" * 10)
        if _MONITOR:
            inc_counter()
            sleep(1)
        sleep(0.5)  # sleep briefly to take some load off the CPU
    return res
def process(url):
    err, uri, last_check = ping(url)
    if err != 200:
        # err is either an HTTP status code (int) or an error message (str)
        ds.insert_one({"Error": str(err).strip('<>'),
                       "url": uri, "last_checked": last_check})
def worker():
    while True:
        url = q.get()
        if url is None:
            break
        process(url)
        q.task_done()
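
# Each worker exits when it pulls a None "poison pill" from the queue;
# one pill per worker is enqueued after q.join() returns below.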
if __name__ == '__main__':
    workers = []
    if _MONITOR:
        Thread(target=monitor).start()  # start monitoring requests/sec
        start_time = time()

    for i in range(_MAX_CONNECTIONS):
        t = Thread(target=worker)
        t.start()
        workers.append(t)

    # block until all tasks are done
    q.join()

    # poison pills: one per worker
    for i in range(_MAX_CONNECTIONS):
        q.put(None)

    # wait for all the threads to join
    for w in workers:
        w.join()

    if _MONITOR:
        print("Time taken %f (sec)" % (time() - start_time))
Questions:
- Can I make use of better threading techniques?
- Can I eliminate the duplication of code between the `if _DEBUG:` and `else:` branches that load the URLs into the queue?
- I would love/prefer to see a functional version of this program (see the first sketch below).
- How can I improve the performance? (My target is to ping 1,000,000 sites in under an hour; see the second sketch below.)
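
To make the functional-version question concrete, something like the following is the shape I have in mind. It is an untested sketch that reuses ping, ds, and getcollection from above; ThreadPoolExecutor.map would replace the hand-rolled queue, workers, and poison pills:

from concurrent.futures import ThreadPoolExecutor

def run(urls, max_workers=_MAX_CONNECTIONS):
    # the executor owns the threads; they are shut down automatically
    # when the `with` block exits, so no poison pills are needed
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for err, uri, last_check in pool.map(ping, urls):
            if err != 200:
                ds.insert_one({"Error": str(err).strip('<>'),
                               "url": uri, "last_checked": last_check})

run(doc['url'] for doc in getcollection().find())

One thing I am unsure about is that map submits every URL up front, which may matter at the 1,000,000-URL scale.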
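On the performance target: with the sleep(1) + sleep(0.5) in ping, 50 threads can issue at most about 33 requests per second, while 1,000,000 sites in an hour needs roughly 278 per second, so the sleeps have to go and the concurrency has to rise well past 50. I have been wondering whether an event loop would scale better than hundreds of OS threads; here is a rough, untested sketch assuming the third-party aiohttp library:

import asyncio
import aiohttp

async def check(session, url):
    try:
        async with session.head(url) as resp:
            return resp.status, url, now()
    except Exception as e:
        return str(e), url, now()

async def main(urls):
    # a single shared session; the connector caps concurrent connections
    connector = aiohttp.TCPConnector(limit=500)
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(connector=connector,
                                     timeout=timeout) as session:
        return await asyncio.gather(*(check(session, u) for u in urls))

results = asyncio.run(main([doc['url'] for doc in getcollection().find()]))

Is something like this the right direction, or is there a better way?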