#!/usr/bin/python import sys import re class NGramSet: def __init__(self, filename, gram_size): self.gram_size = gram_size self.filename = filename self.text = None self.gram_set = {} self.total_count = 0 def read_file(self): # open and read file src = open(self.filename) text = src.read() # normalize text to remove extra space and characters text = text.lower() # lower case chars text = re.sub('[^a-z]', ' ', text) # remove anything not a-z text = re.sub(' *', ' ', text) # shrink all multi spaces to a single self.text= text # takes in text and populates class with ngram info def process(self): if (self.text == None): self.read_file() gram = [] for word in self.text.split(" "): if (word == ''): continue if (len(gram) >= self.gram_size): gram.pop(0) gram.append(word) if (len(gram) == self.gram_size): self.total_count += 1 key = ' '.join(gram) if (not self.gram_set.has_key(key)): self.gram_set[key] = {'gram': list(gram), 'count': 1} else: self.gram_set[key]['count'] += 1 self.generate_gram_stats() def generate_gram_stats(self): for key in self.gram_set: self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count) def print_mysql(self): table_name = re.sub('[^A-Za-z0-9]', '_', "%s_%d" % (self.filename, self.gram_size)) print "DROP TABLE IF EXISTS %s;" % (table_name) print "CREATE TABLE %s (gram VARCHAR(255), count INT DEFAULT 0, percent FLOAT DEFAULT 0.0, PRIMARY KEY(gram));" % (table_name) for key in self.gram_set: print "INSERT INTO %s VALUES (\"%s\", %d, %f);" % (table_name, key, self.gram_set[key]['count'], self.gram_set[key]['percent']) if len(sys.argv) < 3: print "Usage: gramificate.py [N] [FILE]*" print " N Gram size" print " FILE Filename" print "Output: Mysql commands to create a table FILE_N that contains all the grams and" print " associated stats (count of gram, percent of total). Can be directly and" print " safely piped into mysql:" print " ./gramificate.py 2 input.txt | mysql -u USER -pPass -D gram_db" print " and you will end up with a table \"input_txt_2\" with the results" print "Also accepts multiple files, so usage on test docs can be done as follows:" print "./gramificate.py 1 test_docs/* | mysql -u USER -pPass -D gram_db" exit() # generate ngrams gram_size = int(sys.argv[1]) for i in range(2,len(sys.argv)): grams = NGramSet(sys.argv[i], int(sys.argv[1])) grams.process() grams.print_mysql()