Initial commit

2013-01-09 09:40:49 -05:00 · 2013-01-09 09:40:49 -05:00 · 304d676915
commit 304d676915
1 changed files with 56 additions and 0 deletions
--- a/gramificate.py
+++ b/gramificate.py
@ -0,0 +1,56 @@
+#!/usr/bin/python
+
+import sys
+import re
+
+class NGramSet:
+	"""Tallies the word n-grams of one fixed size found in text passed to process()."""
+	def __init__(self, gram_size):
+		self.gram_size = gram_size  # number of words in each gram
+		self.gram_set = {}  # space-joined gram -> {'gram': [words], 'count': n, 'percent': fraction}
+		self.total_count = 0  # grams counted so far, repeats included
+
+	def process(self, text):
+		"""Slide a gram_size-word window over text, count every gram, then refresh the stats."""
+		gram = []
+		for word in text.split(" "):
+			if not word:
+				continue  # consecutive spaces make split(" ") yield empty strings
+			if len(gram) >= self.gram_size:
+				gram.pop(0)  # window is full: drop the oldest word before adding the new one
+			gram.append(word)
+			if len(gram) == self.gram_size:
+				self.total_count += 1
+				key = ' '.join(gram)
+				# setdefault() replaces the dict.has_key() test, which no longer exists in Python 3
+				entry = self.gram_set.setdefault(key, {'gram': list(gram), 'count': 0})
+				entry['count'] += 1
+		self.generate_gram_stats()
+
+	def generate_gram_stats(self):
+		"""Store count / total_count under 'percent' for every gram (a 0..1 fraction, not 0..100)."""
+		for entry in self.gram_set.values():
+			entry['percent'] = entry['count'] / float(self.total_count)
+
+if len(sys.argv) < 2:
+	print("Usage: gramificate.py [FILE]")
+	sys.exit()  # sys.exit() instead of the site-provided exit(); status stays 0
+
+# read the whole input file; 'with' closes the handle even if read() raises
+with open(sys.argv[1]) as src:
+	text = src.read()
+
+# normalize text to remove extra space and characters
+text = text.lower() # lower case chars
+text = re.sub('[^a-z]', ' ', text) # replace anything not a-z (digits, punctuation, newlines) with a space
+text = re.sub('  *', ' ', text) # shrink all multi spaces to a single
+
+# generate ngrams and dump the raw gram tables (single-argument print() behaves the same on Python 2 and 3)
+one_grams = NGramSet(1)
+one_grams.process(text)
+print(one_grams.gram_set)
+two_grams = NGramSet(2)
+two_grams.process(text)
+print(two_grams.gram_set)
+
+# TODO: generate stats about the n-grams in each gram set (nothing is implemented below yet)
+