make more app like, takes gram size as arg
This commit is contained in:
parent
304d676915
commit
903629e333
|
@ -4,15 +4,31 @@ import sys
|
||||||
import re
|
import re
|
||||||
|
|
||||||
class NGramSet:
|
class NGramSet:
|
||||||
def __init__(self, gram_size):
|
def __init__(self, filename, gram_size):
|
||||||
self.gram_size = gram_size
|
self.gram_size = gram_size
|
||||||
|
self.filename = filename
|
||||||
|
self.text = None
|
||||||
self.gram_set = {}
|
self.gram_set = {}
|
||||||
self.total_count = 0
|
self.total_count = 0
|
||||||
|
|
||||||
|
def read_file(self):
|
||||||
|
# open and read file
|
||||||
|
src = open(self.filename)
|
||||||
|
text = src.read()
|
||||||
|
|
||||||
|
# normalize text to remove extra space and characters
|
||||||
|
text = text.lower() # lower case chars
|
||||||
|
text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
|
||||||
|
text = re.sub(' *', ' ', text) # shrink all multi spaces to a single
|
||||||
|
self.text= text
|
||||||
|
|
||||||
|
|
||||||
# takes in text and populates class with ngram info
|
# takes in text and populates class with ngram info
|
||||||
def process(self, text):
|
def process(self):
|
||||||
|
if (self.text == None):
|
||||||
|
self.read_file()
|
||||||
gram = []
|
gram = []
|
||||||
for word in text.split(" "):
|
for word in self.text.split(" "):
|
||||||
if (word == ''):
|
if (word == ''):
|
||||||
continue
|
continue
|
||||||
if (len(gram) >= self.gram_size):
|
if (len(gram) >= self.gram_size):
|
||||||
|
@ -31,26 +47,22 @@ class NGramSet:
|
||||||
for key in self.gram_set:
|
for key in self.gram_set:
|
||||||
self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count)
|
self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count)
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
def print_mysql(self):
|
||||||
print "Usage: gramificate.py [FILE]"
|
print "mysql"
|
||||||
|
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
print "Usage: gramificate.py [N] [FILE]"
|
||||||
|
print " N Gram size"
|
||||||
|
print " FILE Filename"
|
||||||
|
print "Output: Mysql commands to create a table FILE-N that contains all the grams and"
|
||||||
|
print " associated stats (count of gram, percent of total). Can be directly and"
|
||||||
|
print " safely piped into mysql:"
|
||||||
|
print " mysql -u USER -pPass -D gram_db < ./gramificate.py 2 input.txt"
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# open and read file
|
|
||||||
src = open(sys.argv[1])
|
|
||||||
text = src.read()
|
|
||||||
|
|
||||||
# normalize text to remove extra space and characters
|
|
||||||
text = text.lower() # lower case chars
|
|
||||||
text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
|
|
||||||
text = re.sub(' *', ' ', text) # shrink all multi spaces to a single
|
|
||||||
|
|
||||||
# generate ngrams
|
# generate ngrams
|
||||||
one_grams = NGramSet(1)
|
grams = NGramSet(sys.argv[2], int(sys.argv[1]))
|
||||||
one_grams.process(text)
|
grams.process()
|
||||||
print one_grams.gram_set
|
print grams.gram_set
|
||||||
two_grams = NGramSet(2)
|
|
||||||
two_grams.process(text)
|
|
||||||
print two_grams.gram_set
|
|
||||||
|
|
||||||
# generate stats about 'grams from gram set
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue