#!/usr/bin/python
"""Count word n-grams in a text file.

Usage: gramificate.py [FILE]

Reads FILE, normalizes it to lower-case a-z words separated by single
spaces, then prints the 1-gram and 2-gram tables (plain dicts) to stdout.

Origin: gramificate.py as added by commit 304d676 ("Initial commit",
Dan Ballard, 2013-01-09).  The code is kept compatible with both
Python 2.6+ and Python 3.
"""

import re
import sys
from collections import deque

# Anything that is not a lower-case ASCII letter is treated as a separator.
NON_LETTER_RE = re.compile(r'[^a-z]')
# One or more spaces.  The pattern must be ' +', not ' *': ' *' also matches
# the empty string between every pair of characters, so substituting on it
# would insert a space between all letters and turn every word into
# single-letter "words".
MULTI_SPACE_RE = re.compile(r' +')


class NGramSet:
    """Accumulates counts of word n-grams of one fixed size.

    Attributes:
        gram_size:   number of words per gram (>= 1).
        gram_set:    dict mapping the space-joined gram to a record
                     {'gram': [words...], 'count': int, 'percent': float}.
        total_count: total number of grams seen, duplicates included.
    """

    def __init__(self, gram_size):
        """Create an empty set for grams of `gram_size` words.

        Raises:
            ValueError: if `gram_size` is smaller than 1.
        """
        if gram_size < 1:
            raise ValueError("gram_size must be at least 1")
        self.gram_size = gram_size
        self.gram_set = {}
        self.total_count = 0

    def process(self, text):
        """Add every n-gram found in `text`, then refresh the statistics.

        `text` is split on single spaces and empty pieces are skipped, so
        runs of spaces are harmless.  Counts accumulate across calls; the
        sliding window itself starts empty on every call, so no gram spans
        two calls.
        """
        # A bounded deque drops the oldest word automatically once it is
        # full, giving a sliding window of the last `gram_size` words.
        window = deque(maxlen=self.gram_size)
        for word in text.split(" "):
            if not word:
                continue
            window.append(word)
            if len(window) < self.gram_size:
                continue  # not enough words yet for a full gram
            self.total_count += 1
            key = ' '.join(window)
            entry = self.gram_set.get(key)
            if entry is None:
                self.gram_set[key] = {'gram': list(window), 'count': 1}
            else:
                entry['count'] += 1
        self.generate_gram_stats()

    def generate_gram_stats(self):
        """Store each gram's share of all grams seen under 'percent'.

        Despite the key name, the value is a fraction in the range 0..1
        (count / total_count), not a value scaled to 0..100.
        """
        # float() keeps this a true division on Python 2 as well.  When no
        # gram has been recorded gram_set is empty and nothing is divided.
        total = float(self.total_count)
        for entry in self.gram_set.values():
            entry['percent'] = entry['count'] / total


def normalize_text(text):
    """Return `text` lower-cased, with every non a-z run as one space."""
    text = text.lower()
    text = NON_LETTER_RE.sub(' ', text)
    return MULTI_SPACE_RE.sub(' ', text)


def main(argv):
    """Print the 1-gram and 2-gram tables of the file named in argv[1]."""
    if len(argv) < 2:
        print("Usage: gramificate.py [FILE]")
        return

    # open and read file; the with-block guarantees the handle is closed
    with open(argv[1]) as src:
        text = normalize_text(src.read())

    # generate ngrams
    for size in (1, 2):
        grams = NGramSet(size)
        grams.process(text)
        print(grams.gram_set)

    # TODO: generate stats about 'grams from gram set


if __name__ == "__main__":
    main(sys.argv)