Tell me more ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I need to write some code that checks thousands of websites to determine whether or not each one is in English. Below is the source code. Any improvements would be appreciated.

import nltk
import urllib2
import re
import unicodedata

# Stopwords of the NLTK 'english' list, as a set for fast intersection.
ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Everything returned by words() with no argument, minus the English words.
# NOTE(review): presumably words() with no argument spans every language
# list in the corpus -- confirm against the NLTK corpus reader docs.
NON_ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words()).difference(ENGLISH_STOPWORDS)

# Maps each corpus file id (one per stopword list) to its set of stopwords.
STOPWORDS_DICT = dict(
    (lang, set(nltk.corpus.stopwords.words(lang)))
    for lang in nltk.corpus.stopwords.fileids()
)

def get_language(text):
    """Return the STOPWORDS_DICT key whose stopwords overlap *text* the most.

    The text is lower-cased and tokenized with ``nltk.wordpunct_tokenize``;
    each language is scored by how many distinct tokens are in its stopword
    set. On a tie the language encountered first while iterating
    STOPWORDS_DICT is returned.
    """
    tokens = set(nltk.wordpunct_tokenize(text.lower()))

    def overlap(lang):
        # Number of distinct tokens that are stopwords of this language.
        return len(tokens & STOPWORDS_DICT[lang])

    return max(STOPWORDS_DICT, key=overlap)


def checkEnglish(text):
    """Return 1 if *text* contains more English than non-English stopwords.

    Returns 0 when *text* is None or when the English stopword count does
    not strictly exceed the non-English count.
    """
    if text is None:
        return 0

    # Decode (undecodable bytes are replaced), then reduce to plain ASCII:
    # NFKD decomposes characters and the 'ignore' encode drops whatever is
    # still non-ASCII afterwards.
    decoded = unicode(text, errors='replace')
    ascii_text = unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')

    tokens = set(nltk.wordpunct_tokenize(ascii_text.lower()))
    english_hits = len(tokens.intersection(ENGLISH_STOPWORDS))
    other_hits = len(tokens.intersection(NON_ENGLISH_STOPWORDS))
    return 1 if english_hits > other_hits else 0


def getPage(url):
    if not url.startswith("http://"):
        url = "http://" + url
    print "Checking the site ", url
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
        rstPage = response.read()
    except urllib2.HTTPError, e:
        rstPage = None
    except urllib2.URLError, e:
        rstPage = None
    except Exception, e:
        rstPage = None
    return rstPage


# First <p> element of a page:
#   - ``\b`` after "p" keeps <pre>, <param>, ... from matching,
#   - ``[^>]*`` skips any attributes of the opening tag,
#   - the lazy ``.*?`` stops at the first closing tag instead of the last,
#   - DOTALL lets the paragraph span several lines,
#   - IGNORECASE accepts <P>...</P>.
_P_TAG_RE = re.compile(r'<p\b[^>]*>(.*?)</p>', re.IGNORECASE | re.DOTALL)


def getPtag(webPage):
    """Return the content of the first <p> element in *webPage*.

    Parameters:
        webPage: HTML source as a string, or None.

    Returns:
        The text between the first ``<p ...>`` and its closing ``</p>``
        with surrounding whitespace stripped, or None when *webPage* is
        None or contains no <p> element.
    """
    if webPage is None:
        return None
    match = _P_TAG_RE.search(webPage)
    if match is None:
        return None
    return match.group(1).strip()


# <meta ... "description" content="..."> tag:
#   - ``[^>]+?`` keeps the match inside a single tag,
#   - ``[^"]+`` stops the captured value at its own closing quote rather
#     than at the last quote on the line,
#   - IGNORECASE accepts any casing of the tag and attribute names.
_META_DESCRIPTION_RE = re.compile(
    r'<meta\s+[^>]+?"description"\s+content="([^"]+)"[^>]*>',
    re.IGNORECASE,
)


def getDescription(webPage):
    """Return the content of the page's description <meta> tag.

    Only tags where the double-quoted "description" value is followed by a
    double-quoted ``content`` attribute are recognised.

    Parameters:
        webPage: HTML source as a string, or None.

    Returns:
        The non-empty ``content`` value, or None when *webPage* is None or
        no matching tag is found.
    """
    if webPage is None:
        return None
    match = _META_DESCRIPTION_RE.search(webPage)
    if match is None:
        return None
    return match.group(1)


def checking(url):
    """Classify the site at *url* as English or not.

    Returns:
        'NULL' if the page could not be fetched, '1' if the meta
        description, the <p> content or the whole page (tried in that
        order) is judged English by checkEnglish, otherwise '0'.
    """
    html = getPage(url)
    if html is None:
        return 'NULL'

    # Tried in order; later extractors run only if the earlier ones did not
    # yield English text.
    extractors = (getDescription, getPtag, lambda page: page)
    for extract in extractors:
        if checkEnglish(extract(html)) == 1:
            return '1'
    return '0'

if __name__ == "__main__":
    f = open('sample_domain_list.txt').readlines()
    s = open('newestResult.txt', "w")
    for line in f[:20]:
        url = line.split(',')[1][1:-1]
        check = checking(url)
        s.write(url + ',' + check)
        s.write('\n')
        print check

#    f.close()
    s.close()
share|improve this question
Does your code work as you intend it to? What problems do you see with it? (To help us focus on those...) – mac389 Aug 16 '12 at 13:57

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.