Initial commit
This commit is contained in:
commit
304d676915
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
class NGramSet:
    """Count the word n-grams of one fixed size that occur in a text.

    Words are the non-empty pieces obtained by splitting the text on single
    space characters. Every run of ``gram_size`` consecutive words is one
    n-gram occurrence.
    """

    def __init__(self, gram_size):
        """Create an empty set for n-grams made of ``gram_size`` words.

        Raises:
            ValueError: if ``gram_size`` is smaller than 1.
        """
        if gram_size < 1:
            raise ValueError("gram_size must be at least 1")
        # number of words in each n-gram
        self.gram_size = gram_size
        # maps the space-joined n-gram to a dict with the keys
        # 'gram' (list of words), 'count' (occurrences) and, once
        # generate_gram_stats() has run, 'percent' (count / total_count)
        self.gram_set = {}
        # total number of n-gram occurrences seen so far
        self.total_count = 0

    # takes in text and populates class with ngram info
    def process(self, text):
        """Add every n-gram found in ``text`` and refresh the statistics.

        Counts accumulate across calls; n-grams never span two calls.
        """
        # splitting on ' ' yields empty strings for repeated spaces
        words = [word for word in text.split(" ") if word != '']
        size = self.gram_size

        # slide a window of `size` words over the text; the range is empty
        # when the text has fewer words than one n-gram needs
        for start in range(len(words) - size + 1):
            gram = words[start:start + size]
            key = ' '.join(gram)
            self.total_count += 1

            entry = self.gram_set.get(key)
            if entry is None:
                self.gram_set[key] = {'gram': gram, 'count': 1}
            else:
                entry['count'] += 1

        self.generate_gram_stats()

    def generate_gram_stats(self):
        """Store each n-gram's share of all occurrences under 'percent'."""
        # float() keeps the division from truncating on Python 2
        total = float(self.total_count)
        for entry in self.gram_set.values():
            entry['percent'] = entry['count'] / total
|
||||
|
||||
def normalize_text(text):
    """Return ``text`` reduced to lower-case a-z words separated by one space.

    Every character outside a-z becomes a space, then each run of spaces is
    collapsed to a single space. Leading or trailing spaces are kept (as one
    space each).
    """
    text = text.lower()  # lower case chars
    text = re.sub('[^a-z]', ' ', text)  # remove anything not a-z
    # ' +' rather than ' *': a pattern that can match the empty string
    # would insert a space between every pair of characters
    return re.sub(' +', ' ', text)  # shrink all multi spaces to a single


def main(argv):
    """Print the 1-gram and 2-gram tables of the file named in ``argv[1]``.

    Returns the process exit status: 0 on success, 1 when no file was given.
    """
    if len(argv) < 2:
        print("Usage: gramificate.py [FILE]")
        return 1

    # open and read file; the with-block closes it even if reading fails
    with open(argv[1]) as src:
        text = normalize_text(src.read())

    # generate ngrams
    one_grams = NGramSet(1)
    one_grams.process(text)
    print(one_grams.gram_set)

    two_grams = NGramSet(2)
    two_grams.process(text)
    print(two_grams.gram_set)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
|
||||
# generate stats about 'grams from gram set
|
||||
|
Loading…
Reference in New Issue