Make the script more app-like; take the gram size as an argument

This commit is contained in:
Dan Ballard 2013-01-10 12:09:03 -05:00
parent 304d676915
commit 903629e333
1 changed files with 33 additions and 21 deletions

View File

@ -4,15 +4,31 @@ import sys
import re import re
class NGramSet: class NGramSet:
def __init__(self, gram_size): def __init__(self, filename, gram_size):
self.gram_size = gram_size self.gram_size = gram_size
self.filename = filename
self.text = None
self.gram_set = {} self.gram_set = {}
self.total_count = 0 self.total_count = 0
def read_file(self):
    """Read ``self.filename`` and store its normalized text in ``self.text``.

    Normalization lower-cases the text, replaces every character outside
    ``a-z`` with a space, and collapses each run of spaces into a single
    space.  The result is assigned to ``self.text``; nothing is returned.
    """
    # Use a context manager so the file handle is closed even if read() fails.
    with open(self.filename) as src:
        raw = src.read()

    # normalize text to remove extra space and characters
    lowered = raw.lower()  # lower case chars
    letters_only = re.sub(r'[^a-z]', ' ', lowered)  # remove anything not a-z
    # ' +' (one or more spaces) rather than ' *': the latter also matches the
    # empty string and would insert a space between every pair of characters.
    self.text = re.sub(r' +', ' ', letters_only)  # shrink multi spaces to one
# takes in text and populates class with ngram info # takes in text and populates class with ngram info
def process(self, text): def process(self):
if (self.text == None):
self.read_file()
gram = [] gram = []
for word in text.split(" "): for word in self.text.split(" "):
if (word == ''): if (word == ''):
continue continue
if (len(gram) >= self.gram_size): if (len(gram) >= self.gram_size):
@ -31,26 +47,22 @@ class NGramSet:
for key in self.gram_set: for key in self.gram_set:
self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count) self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count)
if len(sys.argv) < 2: def print_mysql(self):
print "Usage: gramificate.py [FILE]" print "mysql"
if len(sys.argv) < 3:
print "Usage: gramificate.py [N] [FILE]"
print " N Gram size"
print " FILE Filename"
print "Output: Mysql commands to create a table FILE-N that contains all the grams and"
print " associated stats (count of gram, percent of total). Can be directly and"
print " safely piped into mysql:"
print " mysql -u USER -pPass -D gram_db < ./gramificate.py 2 input.txt"
exit() exit()
# open and read file
src = open(sys.argv[1])
text = src.read()
# normalize text to remove extra space and characters
text = text.lower() # lower case chars
text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
text = re.sub(' *', ' ', text) # shrink all multi spaces to a single
# generate ngrams # generate ngrams
one_grams = NGramSet(1) grams = NGramSet(sys.argv[2], int(sys.argv[1]))
one_grams.process(text) grams.process()
print one_grams.gram_set print grams.gram_set
two_grams = NGramSet(2)
two_grams.process(text)
print two_grams.gram_set
# generate stats about 'grams from gram set