Tell me more ×

Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

NLTK language detection code in Python

up vote 1 down vote favorite

I need to write some code that checks, for each of thousands of websites, whether it is in English or not. Below is the source code. Any improvements would be appreciated.

import nltk
import urllib2
import re
import unicodedata

# Stopword tables built once at import time from the NLTK stopwords corpus.

# Stopwords listed under the 'english' corpus file.
ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Whatever words() returns with no file id (presumably the stopwords of every
# language in the corpus -- confirm against the NLTK docs), minus the English
# ones. Because of the subtraction, a word present in both sets only ever
# counts as English.
NON_ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words()) - ENGLISH_STOPWORDS

# Maps each corpus file id (used as the language name by get_language) to the
# set of stopwords stored under that file id.
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

def get_language(text):
    """Return the STOPWORDS_DICT key whose stopwords best match *text*.

    The text is lower-cased and split with nltk.wordpunct_tokenize; each
    language is scored by how many distinct tokens appear in its stopword
    set, and the highest-scoring language key is returned. On a tie the
    language that STOPWORDS_DICT yields first wins.
    """
    tokens = set(nltk.wordpunct_tokenize(text.lower()))

    def overlap(lang):
        # Number of distinct tokens that are stopwords of this language.
        return len(tokens & STOPWORDS_DICT[lang])

    return max(STOPWORDS_DICT, key=overlap)


def checkEnglish(text):
    """Return 1 if *text* looks English, otherwise 0.

    None is treated as "not English". Otherwise the byte string is decoded
    (undecodable bytes replaced), NFKD-normalized and reduced to ASCII,
    lower-cased and tokenized. The text counts as English when strictly
    more of its distinct tokens are in ENGLISH_STOPWORDS than in
    NON_ENGLISH_STOPWORDS.
    """
    if text is None:
        return 0

    decoded = unicode(text, errors='replace')
    # NFKD splits accented characters into base letter + combining mark, so
    # the ASCII encode with 'ignore' keeps the base letter and drops the mark.
    ascii_text = unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')
    tokens = set(nltk.wordpunct_tokenize(ascii_text.lower()))

    english_hits = len(tokens & ENGLISH_STOPWORDS)
    foreign_hits = len(tokens & NON_ENGLISH_STOPWORDS)
    return 1 if english_hits > foreign_hits else 0


def getPage(url):
    """Download *url* and return the raw response body, or None on failure.

    A bare host name such as ``example.com`` is prefixed with ``http://``.
    URLs that already start with ``http://`` or ``https://`` are requested
    unchanged.

    This is a best-effort fetch: any error while opening or reading the
    page (HTTP error status, unreachable host, malformed URL, ...) makes
    the function return None instead of raising.
    """
    # Check both schemes: testing only "http://" would rewrite
    # "https://host" into the invalid "http://https://host".
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    # Two spaces before the URL are deliberate: they reproduce the output of
    # the former statement `print "Checking the site ", url`.
    print("Checking the site  %s" % url)
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the socket promptly; this matters when crawling
            # thousands of sites in one run.
            response.close()
    except Exception:
        # HTTPError and URLError are both Exception subclasses, so a single
        # clause covers every failure the caller treats as "page unavailable".
        return None


def getPtag(webPage):
    """Return the text captured from the first ``<p ...</p>`` match, or None.

    None is returned when *webPage* is None or when the pattern does not
    match. The capture is greedy and ``.`` does not cross newlines, so on a
    single line holding several paragraphs everything up to the last
    ``</p>`` on that line is returned, and tag attributes are part of the
    captured text.
    """
    if webPage is None:
        return None

    match = re.search(r'<p\W*(.+)\W*</p>', webPage)
    if match is None:
        return None
    return match.group(1)


def getDescription(webPage):
    """Return the ``content`` value of the page's description meta tag.

    Looks for a ``<meta ...>`` tag that contains ``"description"`` or
    ``"Description"`` followed by a ``content="..."`` attribute, and returns
    the attribute value. Returns None when *webPage* is None or when no
    such tag is found.
    """
    if webPage is None:
        return None

    match = re.search(r'<meta\s+.+\"[Dd]escription\"\s+content=\"(.+)\"\s*/*>', webPage)
    return None if match is None else match.group(1)


def checking(url):
    """Classify the site at *url* as English ('1'), not English ('0') or
    unreachable ('NULL').

    The page is fetched with getPage. Three pieces of text are then tried in
    order -- the meta description, the first paragraph match, and the whole
    page -- and the site is reported as English as soon as checkEnglish
    accepts one of them.
    """
    page = getPage(url)
    if page is None:
        return 'NULL'

    # Extractors are applied lazily, in order, so a later one only runs if
    # every earlier candidate was rejected.
    extractors = (getDescription, getPtag, lambda raw: raw)
    for extract in extractors:
        if checkEnglish(extract(page)) == 1:
            return '1'
    return '0'

if __name__ == "__main__":
    # Classify the first 20 entries of the domain list and write one
    # "<url>,<result>" line per entry to the result file.
    # The `with` block closes both files even if a lookup raises; the input
    # handle was previously never closed at all.
    with open('sample_domain_list.txt') as domains, open('newestResult.txt', "w") as results:
        for line in domains.readlines()[:20]:
            # The URL is the second comma-separated field with its first and
            # last characters dropped.
            url = line.split(',')[1][1:-1]
            check = checking(url)
            results.write(url + ',' + check + '\n')
            print(check)

edited Jul 31 '12 at 9:28

Coral Doe
1798

asked Jul 28 '12 at 7:57

akhter wahab
1484

Does your code work as you intend it to? What problems do you see with it? (To help us focus on those...) – mac389 Aug 16 '12 at 13:57

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Name

Email required, but not shown

Post as a guest

Name

Email required, but not shown

discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged python parsing or ask your own question.

question feed

asked	9 months ago
viewed	274 times

NLTK language detection code in Python

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Browse other questions tagged python parsing or ask your own question.

Related