79 lines
2.5 KiB
Python
Executable File
79 lines
2.5 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
import sys
|
|
import re
|
|
|
|
class NGramSet:
    """Word n-gram frequency table for a single text file.

    After process() has run, ``gram_set`` maps each n-gram (its words joined
    by single spaces) to a dict with the keys:

    * ``'gram'``    -- the words of the n-gram, as a list
    * ``'count'``   -- how many times the n-gram occurred
    * ``'percent'`` -- ``count / total_count`` as a float (a fraction
      between 0 and 1, despite the name)
    """

    def __init__(self, filename, gram_size):
        """Remember the input path and n-gram size; no I/O happens here.

        filename  -- path of the text file to analyse
        gram_size -- number of words per n-gram
        """
        self.gram_size = gram_size
        self.filename = filename
        self.text = None       # normalized text, loaded lazily by read_file()
        self.gram_set = {}     # n-gram key -> {'gram', 'count', 'percent'}
        self.total_count = 0   # total number of n-gram occurrences counted

    def read_file(self):
        """Load ``filename`` and store its normalized contents in ``text``.

        Normalization lower-cases the text, turns every character outside
        a-z into a space, and collapses each run of spaces to one space.
        """
        # 'with' guarantees the handle is closed even if read() fails.
        with open(self.filename) as src:
            text = src.read()

        text = text.lower()
        text = re.sub('[^a-z]', ' ', text)
        # ' +' rather than ' *': the latter also matches the empty string,
        # which makes re.sub insert a space between every single character.
        text = re.sub(' +', ' ', text)
        self.text = text

    def process(self):
        """Count every n-gram in the text and compute its statistics.

        Reads the file first if ``text`` has not been set yet.  Words are
        the space-separated tokens of ``text``; empty tokens are skipped.

        Raises ValueError if ``gram_size`` is smaller than 1.
        """
        if self.gram_size < 1:
            raise ValueError("gram_size must be at least 1, got %r"
                             % (self.gram_size,))
        if self.text is None:
            self.read_file()

        gram = []  # sliding window holding the most recent gram_size words
        for word in self.text.split(" "):
            if word == '':
                continue
            if len(gram) >= self.gram_size:
                gram.pop(0)
            gram.append(word)
            # The window only yields grams once it has filled up.
            if len(gram) == self.gram_size:
                self.total_count += 1
                key = ' '.join(gram)
                if key not in self.gram_set:
                    # Copy the window: it keeps being mutated by the loop.
                    self.gram_set[key] = {'gram': list(gram), 'count': 1}
                else:
                    self.gram_set[key]['count'] += 1
        self.generate_gram_stats()

    def generate_gram_stats(self):
        """Store each n-gram's share of ``total_count`` under 'percent'."""
        # float() forces true division on Python 2 as well.
        total = float(self.total_count)
        for entry in self.gram_set.values():
            entry['percent'] = entry['count'] / total

    def print_mysql(self):
        """Print MySQL statements that recreate and fill the result table.

        The table name is "<filename>_<gram_size>" with every character
        outside A-Z, a-z and 0-9 replaced by an underscore.
        """
        table_name = re.sub('[^A-Za-z0-9]', '_',
                            "%s_%d" % (self.filename, self.gram_size))
        print("DROP TABLE IF EXISTS %s;" % (table_name))
        print("CREATE TABLE %s (gram VARCHAR(255), count INT DEFAULT 0, percent FLOAT DEFAULT 0.0, PRIMARY KEY(gram));" % (table_name))
        for key in self.gram_set:
            print("INSERT INTO %s VALUES (\"%s\", %d, %f);" % (table_name, key, self.gram_set[key]['count'], self.gram_set[key]['percent']))
|
|
|
|
# Command-line driver: argv[1] is the gram size, argv[2:] are the input files.
if len(sys.argv) < 3:
    # The usage text goes to stderr because stdout is meant to be piped
    # straight into mysql, and a usage error exits with a non-zero status.
    usage_lines = [
        "Usage: gramificate.py [N] [FILE]*",
        " N Gram size",
        " FILE Filename",
        "Output: Mysql commands to create a table FILE_N that contains all the grams and",
        " associated stats (count of gram, percent of total). Can be directly and",
        " safely piped into mysql:",
        " ./gramificate.py 2 input.txt | mysql -u USER -pPass -D gram_db",
        " and you will end up with a table \"input_txt_2\" with the results",
        "Also accepts multiple files, so usage on test docs can be done as follows:",
        "./gramificate.py 1 test_docs/* | mysql -u USER -pPass -D gram_db",
    ]
    sys.stderr.write("\n".join(usage_lines) + "\n")
    sys.exit(1)

# generate ngrams
gram_size = int(sys.argv[1])  # parsed once and reused for every file

for filename in sys.argv[2:]:
    grams = NGramSet(filename, gram_size)
    grams.process()
    grams.print_mysql()
|
|
|
|
|