I have a working code, but it has a lot of flaws.
The main project is to get a value from a website. I need to complete this action million of times, so i download the webpage returned from the class method, and parse it afterwards. In case the page is already downloaded i just open that one up.
This code is working, however even with the multiple except socket.error
i do sometimes get socket errors when running it in parallel.
Also i am kinda new to programming, so any advice would be really appreciated. (I know this code is kinda too long)
class Uniprot:
"""
Gets the UNIPROT XML or HTML, very bad code.... Cant come up with a better one yet :(
"""
def __init__(self, uniid):
"""
Initialize stuff
:param uniid: Same as in parent class
:return:
"""
self.uniid = uniid.upper()
self.soup = ""
def xml(self):
"""
returns the full XML file
:return:
"""
xmlpage = urllib2.urlopen("http://www.uniprot.org/uniprot/%s.xml" % self.uniid, timeout=2)
if xmlpage.getcode() != 200:
return
self.soup = BeautifulSoup(xmlpage)
return self.soup.get_text
def web(self):
"""
Returns the full HTML file
:return:
"""
webpage = urllib2.urlopen("http://www.uniprot.org/uniprot/%s" % self.uniid, timeout=2)
# UNIPROT error handler
if webpage.getcode() != 200:
return
websoup = BeautifulSoup(webpage)
return websoup.get_text
def uniprotannotation():
"""
Chaos, and anarchy right again, same with the errors, plus its an HTML variable instead of an XML
:return: Annotation score from UNIPROT
"""
uniprotxml = ""
soup = ""
# Check if the folder is present
if not os.path.isdir("%s/%s/UNIPROT" % (DATA_DIR, FASTA_OBJECT.organism())):
os.makedirs("%s/%s/UNIPROT" % (DATA_DIR, FASTA_OBJECT.organism()))
# Check if the file is present, if not create it
try:
f = open("%s/%s/UNIPROT/%s.html" % (DATA_DIR,
FASTA_OBJECT.organism(), FASTA_OBJECT.accession()), "r")
for i in f:
uniprotxml += i
f.close()
# Check if the file is empty
if os.stat("%s/%s/UNIPROT/%s.html" % (DATA_DIR,
FASTA_OBJECT.organism(), FASTA_OBJECT.accession())).st_size == 0:
os.remove("%s/%s/UNIPROT/%s.html" % (DATA_DIR,
FASTA_OBJECT.organism(), FASTA_OBJECT.accession()))
sys.stderr.write(
"%s, removed empty html, running it again\n" % sys.argv[1])
uniprotannotation()
soup = BeautifulSoup(uniprotxml)
except IOError:
try:
soup = BeautifulSoup(str(FASTA_OBJECT.Uniprot(sys.argv[1]).web()))
except socket.timeout:
sys.stderr.write(
"%s, HTML socket timeout, trying again\n" % sys.argv[1])
uniprotannotation()
except socket.error:
sys.stderr.write(
"%s, HTML socket error, trying again\n" % sys.argv[1])
uniprotannotation()
f = open("%s/%s/UNIPROT/%s.html" % (DATA_DIR,
FASTA_OBJECT.organism(), FASTA_OBJECT.accession()), "w")
f.write(str(FASTA_OBJECT.Uniprot(sys.argv[1]).web()))
f.close()
if not soup:
return "-"
result = re.search("Annotation score: \d out of \d", str(soup.get_text))
if not result:
return "-"
return re.search("\d", str(result.group())).group()[0]