Make the script more app-like: take the gram size as a command-line argument

2013-01-10 12:09:03 -05:00 · 2013-01-10 12:09:03 -05:00 · 903629e333
parent 304d676915
commit 903629e333
1 changed file with 33 additions and 21 deletions
--- a/gramificate.py
+++ b/gramificate.py
@@ -4,15 +4,31 @@ import sys
 import re
 class NGramSet:
-	def __init__(self, gram_size):
+	def __init__(self, filename, gram_size):
 		self.gram_size = gram_size
 		self.filename = filename
 		self.text = None
 		self.gram_set = {}
 		self.total_count = 0
 	def read_file(self):
 		# open and read file
 		src = open(self.filename)
 		text = src.read()
 		# normalize text to remove extra space and characters
 		text = text.lower() # lower case chars
 		text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
 		text = re.sub('  *', ' ', text) # shrink all multi spaces to a single
 		self.text= text
 	# takes in text and populates class with ngram info
-	def process(self, text):
+	def process(self):
 		if (self.text == None):
 			self.read_file()
 		gram = []
-		for word in text.split(" "):
+		for word in self.text.split(" "):
 			if (word == ''):
 				continue
 			if (len(gram) >= self.gram_size):
@@ -31,26 +47,22 @@ class NGramSet:
 		for key in self.gram_set:
 			self.gram_set[key]['percent'] = self.gram_set[key]['count'] / float(self.total_count)
-if len(sys.argv) < 2:
+	def print_mysql(self):
-	print "Usage: gramificate.py [FILE]"
+		print "mysql"
 if len(sys.argv) < 3:
 	print "Usage: gramificate.py [N] [FILE]"
 	print "  N     Gram size"
 	print "  FILE  Filename"
 	print "Output: Mysql commands to create a table FILE-N that contains all the grams and" 
 	print "        associated stats (count of gram, percent of total). Can be directly and"
 	print "        safely piped into mysql:"
 	print "          mysql -u USER -pPass -D gram_db < ./gramificate.py 2 input.txt" 
 	exit()
 # open and read file
 src = open(sys.argv[1])
 text = src.read()
 # normalize text to remove extra space and characters
 text = text.lower() # lower case chars
 text = re.sub('[^a-z]', ' ', text) # remove anything not a-z
 text = re.sub('  *', ' ', text) # shrink all multi spaces to a single
 # generate ngrams
-one_grams = NGramSet(1)
+grams = NGramSet(sys.argv[2], int(sys.argv[1]))
-one_grams.process(text)
+grams.process()
-print one_grams.gram_set
+print grams.gram_set
 two_grams = NGramSet(2)
 two_grams.process(text)
 print two_grams.gram_set
 # generate stats about 'grams from gram set