diff --git a/gramificate.py b/gramificate.py
index 810fddf..ea5ed58 100755
--- a/gramificate.py
+++ b/gramificate.py
@@ -4,15 +4,31 @@ import sys
 import re
 
 class NGramSet:
-    def __init__(self, gram_size):
+    def __init__(self, filename, gram_size):
         self.gram_size = gram_size
+        self.filename = filename
+        self.text = None
         self.gram_set = {}
         self.total_count = 0
 
+    def read_file(self):
+        """Read self.filename and store its normalized text in self.text."""
+        with open(self.filename) as src:
+            text = src.read()
+
+        # normalize text to remove extra space and characters
+        text = text.lower() # lower case chars
+        text = re.sub('[^a-z]', ' ', text) # replace anything not a-z with a space
+        text = re.sub(' +', ' ', text) # shrink each run of spaces to a single space
+        self.text = text
+
+
-    # takes in text and populates class with ngram info
-    def process(self, text):
+    # populates class with ngram info from self.text (read from file on first use)
+    def process(self):
+        if self.text is None:
+            self.read_file()
         gram = []
-        for word in text.split(" "):
+        for word in self.text.split(" "):
             if (word == ''):
                 continue
             if (len(gram) >= self.gram_size):
@@ -31,26 +47,22 @@ class NGramSet:
         for key in self.gram_set:
             self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count)
 
-if len(sys.argv) < 2:
-    print "Usage: gramificate.py [FILE]"
+    def print_mysql(self):
+        print "mysql"
+
+if len(sys.argv) < 3:
+    print "Usage: gramificate.py [N] [FILE]"
+    print " N Gram size"
+    print " FILE Filename"
+    print "Output: Mysql commands to create a table FILE-N that contains all the grams and"
+    print " associated stats (count of gram, percent of total). Can be directly and"
+    print " safely piped into mysql:"
+    print " ./gramificate.py 2 input.txt | mysql -u USER -pPass -D gram_db"
     exit()
 
-# open and read file
-src = open(sys.argv[1])
-text = src.read()
-
-# normalize text to remove extra space and characters
-text = text.lower() # lower case chars
-text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
-text = re.sub(' *', ' ', text) # shrink all multi spaces to a single
-
 # generate ngrams
-one_grams = NGramSet(1)
-one_grams.process(text)
-print one_grams.gram_set
-two_grams = NGramSet(2)
-two_grams.process(text)
-print two_grams.gram_set
+grams = NGramSet(sys.argv[2], int(sys.argv[1]))
+grams.process()
+print grams.gram_set
 
 
-# generate stats about 'grams from gram set