#!/usr/bin/python
"""CGI script that renders a random sample of words from a remote web page.

The page address is read from the ``url`` form field, the page is
downloaded, up to MAXWORDS words are collected from its body text, and the
words are merged into the ``magnetic_poetry.html`` template, which is
written to standard output after a ``Content-type`` header.
"""

import os, sys

# Extra module search location; presumably where timeoutsocket lives
# (TODO confirm).
sys.path.insert(0, '/home/mark/dev')

# NOTE: timeoutsocket is imported and configured *before* urllib, exactly
# as in the original statement order.
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(10)  # presumably seconds -- confirm

import urllib, random, cgi
from sgmllib import SGMLParser

# Upper bound on the number of words kept from one page.
MAXWORDS = 100


class WordParser(SGMLParser):
    """SGML parser that collects words from the text after the <body> tag.

    Attributes:
        words: list of whitespace-separated words collected so far.
        capture: 0 until a <body> start tag is seen, then 1.
    """

    def reset(self):
        """Reset the parser state and clear the collected words."""
        SGMLParser.reset(self)
        self.words = []
        self.capture = 0

    def start_body(self, attrs):
        """Start collecting text once the <body> tag has been reached."""
        self.capture = 1

    def handle_data(self, data):
        """Add the words of one text chunk, skipping about half at random.

        Text seen before <body> is ignored. Once more than MAXWORDS words
        have been gathered the list is cut back to MAXWORDS and tag
        processing is switched off for the rest of the input.
        """
        if not self.capture:
            return
        # Coin flip: drop this chunk entirely so each request yields a
        # different sample of the page.
        if random.randint(0, 1):
            return
        self.words.extend(data.split())
        if len(self.words) > MAXWORDS:
            self.words = self.words[:MAXWORDS]
            self.setnomoretags()


def getWords(uri):
    """Download *uri* and return a list of words sampled from its body.

    At most 100000 bytes of the response are read. This is a best-effort
    operation: if anything goes wrong while downloading or parsing, a list
    of words forming an apology (ending with *uri* itself) is returned
    instead of raising.
    """
    try:
        usock = urllib.urlopen(uri)
        # Nested try/finally (rather than try/except/finally or ``with``)
        # so the handle is always closed, even on old Python 2 releases.
        try:
            p = WordParser()
            p.feed(usock.read(100000))
            p.close()
        finally:
            usock.close()
        return p.words
    except Exception:
        # Deliberately broad: any failure becomes a friendly word list.
        # Unlike a bare ``except:``, this does not trap SystemExit or
        # KeyboardInterrupt on interpreters where those are not Exceptions.
        return ['Sorry', 'an', 'error', 'occurred', 'trying', 'to',
                'download', uri]


def output(wordList):
    """Return the page template with *wordList* merged into it.

    Each word is HTML-escaped with ``cgi.escape`` and the words are joined
    with newlines. The template is read from ``magnetic_poetry.html``,
    resolved relative to the current working directory.
    """
    template = open('magnetic_poetry.html')
    try:
        htmlIn = template.read()
    finally:
        template.close()
    # NOTE(review): the per-word format below is a bare "%s" and the
    # replace() target is the empty string, so the joined words are inserted
    # between every character of the template. Both literals look as if they
    # lost some markup (a wrapper element / a placeholder marker) --
    # confirm the intended strings against the template file.
    htmlOut = "\n".join(["%s" % cgi.escape(word) for word in wordList])
    return htmlIn.replace("", htmlOut)


def main():
    """Handle one CGI request: emit the header, then the rendered page."""
    # Header line followed by the blank line that ends the CGI headers.
    sys.stdout.write('Content-type: text/html\n\n')

    # grab uri stuffed in by mod_rewrite
    fs = cgi.FieldStorage()
    # getfirst() copes with both a missing field and a repeated one, either
    # of which would make fs["url"].value raise after the header is sent.
    remoteURI = fs.getfirst("url", "")
    if remoteURI:
        if not remoteURI.startswith('http://'):
            remoteURI = 'http://%s' % remoteURI
        words = getWords(remoteURI)
    else:
        words = ['Sorry', 'no', 'URL', 'was', 'supplied']

    sys.stdout.write(output(words) + '\n')


if __name__ == '__main__':
    main()