Initial commit
This commit is contained in:
		
						commit
						304d676915
					
				|  | @ -0,0 +1,56 @@ | |||
| #!/usr/bin/python | ||||
| 
 | ||||
| import sys | ||||
| import re | ||||
| 
 | ||||
class NGramSet:
    """Count the word n-grams of a fixed size that occur in a text.

    Attributes:
        gram_size: number of words in each n-gram.
        gram_set: dict keyed by the space-joined n-gram. Each value is a
            dict holding 'gram' (the list of words), 'count' (number of
            occurrences) and, once process() has run, 'percent'.
        total_count: total number of n-grams seen so far, repeats included.
    """

    def __init__(self, gram_size):
        """Create an empty set that will collect n-grams of gram_size words."""
        self.gram_size = gram_size
        self.gram_set = {}
        self.total_count = 0

    def process(self, text):
        """Scan text and record every n-gram of gram_size consecutive words.

        Words are the non-empty pieces obtained by splitting text on single
        space characters. Counts accumulate across calls, and the 'percent'
        statistics are refreshed before returning.
        """
        # Sliding window holding the most recent gram_size words.
        gram = []
        for word in text.split(" "):
            # Consecutive spaces produce empty pieces; they are not words.
            if not word:
                continue
            # Drop the oldest word once the window is full.
            if len(gram) >= self.gram_size:
                gram.pop(0)
            gram.append(word)
            if len(gram) == self.gram_size:
                self.total_count += 1
                key = ' '.join(gram)
                # `in` replaces dict.has_key(), which no longer exists in
                # Python 3; it behaves the same on Python 2.
                if key not in self.gram_set:
                    # Store a copy: the window list keeps being mutated.
                    self.gram_set[key] = {'gram': list(gram), 'count': 1}
                else:
                    self.gram_set[key]['count'] += 1
        self.generate_gram_stats()

    def generate_gram_stats(self):
        """Set each entry's 'percent' to count / total_count.

        Despite the key name, the stored value is a fraction between 0 and 1,
        not a value scaled to 100.
        """
        # float() forces true division on Python 2 as well.
        total = float(self.total_count)
        for entry in self.gram_set.values():
            entry['percent'] = entry['count'] / total
| 
 | ||||
# A file path is required as the first command-line argument.
if len(sys.argv) < 2:
    print("Usage: gramificate.py [FILE]")
    # sys.exit() rather than the site-provided exit(), which is not
    # guaranteed to be available; the exit status is unchanged.
    sys.exit()

# Open and read the file; the with-block closes the handle afterwards.
with open(sys.argv[1]) as src:
    text = src.read()

# Normalize text to remove extra space and characters.
text = text.lower()  # lower-case all characters
text = re.sub('[^a-z]', ' ', text)  # replace anything that is not a-z with a space
text = re.sub('  *', ' ', text)  # shrink runs of spaces to a single space

# Generate the 1-gram and 2-gram tables and dump them.
# print(...) with a single argument behaves the same on Python 2 and 3.
one_grams = NGramSet(1)
one_grams.process(text)
print(one_grams.gram_set)
two_grams = NGramSet(2)
two_grams.process(text)
print(two_grams.gram_set)
| 
 | ||||
| # generate stats about 'grams from gram set | ||||
| 
 | ||||
		Loading…
	
		Reference in New Issue