#!/usr/bin/python from __future__ import generators import string, re, cPickle, random class NGram: def __init__(self, n=2): self.reset(n) def reset(self, n=2): self.n = n self.ngrams = {} self.start_context = ["^"]*n def learn(self, filename): context = self.start_context[:] for w in get_words(filename): if len(context) and context[-1] == ".": context = self.start_context[:] if len(context) < self.n: context.append(w) continue k = tuple(context) if k not in self.ngrams: self.ngrams[k] = {w:1} else: if w not in self.ngrams[k]: self.ngrams[k][w] = 1 else: self.ngrams[k][w] += 1 context.append(w) del context[0] self.renormalize() def load(self, filename): fp = open(filename, "r") self.ngrams = cPickle.load(fp) self.renormalize() fp.close() def save(self, filename): fp = open(filename, "w") cPickle.dump(self.ngrams, fp) fp.close() def renormalize(self): # normalize frequencies self.normalized = {} for k in self.ngrams.keys(): total = float(sum(self.ngrams[k].values())) self.normalized[k] = dict([(w, self.ngrams[k][w]/total) for w in self.ngrams[k].keys()]) def generate(self): sentence = self.start_context[:] while sentence[-1] != ".": context = tuple(sentence[-(self.n):]) sentence.append(self.generate_word(context)) s = " ".join(sentence[self.n:-1])+"." s = s[0].upper()+s[1:] return s def generate_word(self, context): ng = self.normalized[context] n = random.uniform(0,1) for w in ng.keys(): if n0, l.lower().split(" ")): yield w return if __name__ == "__main__": NG = NGram() NG.learn("art.asc") #NG.load("bible1-2.dat") #NG.load("taalboek-2.dat") for i in range(30): print NG.generate(),