Extract frequent words from the text in Python
Tested on Python 2
To extract the most frequent words from a text, we can use a Python dict as a word-frequency table. If you need to exclude additional words, add them to the `stopwords` string.
# prints 40 top keywords for the text
def PrintTopKeywords(text, limit=40):
    """Print the most frequent words of *text* as a small HTML fragment.

    The text is lower-cased and split on whitespace, and any trailing
    ``, . ! ? ;`` characters are stripped from each token.  Tokens shorter
    than two characters and tokens found in the stop-word list are ignored.
    A ``<h3>`` heading is printed first, followed by one ``<h5>`` line per
    word with its count, most frequent first.  Words with equal counts are
    listed alphabetically so the output is deterministic.

    Args:
        text: The text to analyse (``str``, or ``unicode`` on Python 2).
        limit: Maximum number of keywords to print; defaults to 40.

    Returns:
        None.  The result is written to standard output.
    """
    # Function-scope import keeps the snippet self-contained.
    from collections import Counter

    # To ignore more words, add them to this space-separated string.
    stopwords = 'I we he she a about an and are as at be by com for from how if in is it me more my myself of on or that the this to was what when where who will with you your'
    stoplist = frozenset(stopwords.split())  # set: O(1) membership tests

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("<h3>Top keywords</h3>")

    trailing_punctuation = ",.!?;"
    counts = Counter()
    for token in text.lower().split():
        # Strip *all* trailing punctuation, including mixed runs like "?!".
        word = token.rstrip(trailing_punctuation)
        if len(word) > 1 and word not in stoplist:
            counts[word] += 1

    # Highest count first; alphabetical order breaks ties.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    # NOTE: words are emitted as-is (not HTML-escaped); escape them first if
    # the text may contain markup.
    for word, count in ranked[:limit]:
        if not isinstance(word, str):
            # Python 2 ``unicode`` word: encode so it can be printed safely.
            word = word.encode('utf-8')
        print("<h5> %s %d </h5>" % (word, count))
############################################
…
tags: & category: -