"""Generate Markov-chain text from the words inside ``<div id="main">``.

The chain maps a two-word key (``"word1 word2"``) to the list of words
that followed that pair in the parsed documents.  Successor lists keep
duplicates, so ``random.choice`` picks successors in proportion to how
often they were seen.
"""

import glob
import random
import sys

try:
    import cPickle  # Python 2
except ImportError:  # Python 3 has no cPickle; keep the module-level name.
    import pickle as cPickle

try:
    from sgmllib import SGMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Small stand-in used only when ``sgmllib`` is unavailable.

        Dispatches tags to ``start_<tag>(attrs)`` / ``end_<tag>()`` methods
        when the subclass defines them; other tags are ignored.
        NOTE(review): text chunking and entity handling are whatever
        ``html.parser`` does and may differ from sgmllib -- confirm before
        relying on identical chains across interpreters.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            handler = getattr(self, "end_" + tag, None)
            if handler is not None:
                handler()


# Glob pattern of the HTML files that createInput() parses.
ARCHIVE_GLOB = r'/home/mark/d/archives/*/*/*/*.html'
# Pickle file that caches the chain between runs.
DATA_FILE = r'markov.dat'
# Seed used when no (non-empty) command-line argument is given.
DEFAULT_SEED = 'Sam Ruby'
# A word ending in one of these characters ends the current output line.
SENTENCE_ENDINGS = ('.', '?', '!', '"')
# Set to True to rebuild DATA_FILE from ARCHIVE_GLOB instead of loading it.
REBUILD_CORPUS = False


class MarkovParser(SGMLParser):
    """Parser that builds a word-pair -> successors table from HTML text.

    Only text between ``<div id="main">`` and its matching ``</div>`` is
    used.  After feeding, the table is available as ``self.markov``.
    """

    def reset(self):
        """Reset parser state and start with an empty chain."""
        SGMLParser.reset(self)
        self.markov = {}       # "w1 w2" -> [words seen after that pair]
        self.wordbuffer = []   # trailing words not yet followed by enough text
        self.capture = 0       # <div> nesting depth inside the main div; 0 = off

    def start_div(self, attrs):
        """Start capturing at ``<div id="main">``; track nested divs."""
        if self.capture:
            # Already inside the main div: remember the extra nesting level so
            # the nested </div> does not end capture prematurely.
            self.capture += 1
        elif ("id", "main") in attrs:
            self.capture = 1

    def end_div(self):
        """Leave one div level; capture stops when the main div closes."""
        if self.capture:
            self.capture -= 1

    def handle_data(self, data):
        """Add the whitespace-separated words of *data* to the chain."""
        if not self.capture:
            return
        words = self.wordbuffer
        words.extend(data.split())
        # Every run of three consecutive words contributes one transition.
        for first, second, successor in zip(words, words[1:], words[2:]):
            key = "%s %s" % (first, second)
            self.markov.setdefault(key, []).append(successor)
        # Keep only the last two words; they prefix the next chunk's first key.
        del words[:-2]


def markovString(data):
    """Return the chain (dict of ``"w1 w2"`` -> list of words) for HTML *data*."""
    m = MarkovParser()
    m.feed(data)
    return m.markov


def markovFile(filename):
    """Print *filename*, read it, and return the chain for its contents."""
    print(filename)
    with open(filename) as fsock:
        data = fsock.read()
    return markovString(data)


def markovFiles(filenames):
    """Return one chain merging the chains of every file in *filenames*.

    Successor lists for the same key are concatenated in file order.
    """
    merged = {}
    for filename in filenames:
        for key, successors in markovFile(filename).items():
            merged.setdefault(key, []).extend(successors)
    return merged


def createInput():
    """Build the chain from ARCHIVE_GLOB, pickle it to DATA_FILE, return it."""
    chain = markovFiles(glob.glob(ARCHIVE_GLOB))
    # Binary mode is required by pickle on Python 3 and is equivalent to the
    # original text mode on POSIX systems.
    with open(DATA_FILE, 'wb') as fsock:
        cPickle.dump(chain, fsock)
    return chain


def output(markov, seed, length=10000):
    """Print *seed* followed by up to *length* randomly chosen words.

    markov -- chain as returned by markovString()/markovFiles()
    seed   -- starting key, normally two words separated by one space
    length -- maximum number of words to generate after the seed

    Words are separated by single spaces; a new line starts after any word
    ending in one of SENTENCE_ENDINGS.  Generation stops early, instead of
    raising KeyError, when the current pair has no recorded successor.
    """
    line = [seed]
    key = seed
    for _ in range(length):
        successors = markov.get(key)
        if not successors:
            break
        word = random.choice(successors)
        line.append(word)
        if word.endswith(SENTENCE_ENDINGS):
            print(" ".join(line))
            line = []
        key = "%s %s" % (key.split()[-1], word)
    if line:
        # Flush the unfinished last sentence (or the bare seed).
        print(" ".join(line))


if __name__ == '__main__':
    if REBUILD_CORPUS:
        markov = createInput()
    else:
        # NOTE: unpickling can execute arbitrary code -- only load a
        # DATA_FILE that was produced by createInput() on a trusted machine.
        with open(DATA_FILE, 'rb') as fsock:
            markov = cPickle.load(fsock)
    # An absent or empty first argument falls back to the default seed.
    seed = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else DEFAULT_SEED
    output(markov, seed)