Tantek Çelik: XHTML vs. the world. Bet on the world.

#!/usr/bin/python
import urllib, re, cgi

# Maximum number of scraped entries emitted as <item> elements in the feed.
NUMBER_OF_POSTS = 5
# Maximum number of whitespace-separated words kept in each item's
# plain-text <description> excerpt.
EXCERPT_LENGTH = 40

def removeHTML(htmlSource):
    """Return htmlSource with its markup tags removed and outer whitespace stripped.

    The source is split on "<"; every chunk after the first begins inside a
    tag, so everything up to and including that chunk's first ">" is dropped.
    A chunk with no ">" at all (an unterminated tag) is kept as-is.
    Character entities such as "&amp;" are not decoded.
    """
    segments = htmlSource.split("<")
    # The first segment precedes any "<", so it is plain text and must be
    # kept whole: a literal ">" in it is not the end of a tag.
    text = [segments[0]]
    # maxsplit=1 so that a literal ">" in the text following a tag survives.
    text.extend([segment.split(">", 1).pop() for segment in segments[1:]])
    return "".join(text).strip()

def geturl(url):
    """Fetch url and return the entire response body.

    Whatever urllib.urlopen() or read() raises is propagated to the caller.
    """
    usock = urllib.urlopen(url)
    try:
        return usock.read()
    finally:
        # Release the connection even when read() fails part-way through.
        usock.close()

frontpagedata = geturl('http://www.tantek.com/log/')
refreshpattern = re.compile(r'''<meta http-equiv="REFRESH" content="0; URL=(.*?)"''')
relurl = refreshpattern.search(frontpagedata).group(1)
data = geturl('http://www.tantek.com/log/%s' % relurl)
itempattern = re.compile(r"""<h2>(.*?)<a href='(.*?)'.*?<div class='entry.*?>(.*?)</div>""", re.DOTALL)
print """Content-type: application/rss+xml

<?xml version="1.0"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Tantek's Log</title>
<description>Scraped for her pleasure</description>
<link>http://www.tantek.com/log/</link>"""
for title, rellink, description in itempattern.findall(data)[:NUMBER_OF_POSTS]:
    print """<item>"""
    print """<title>%s</title>""" % title.strip()
    print """<link>http://www.tantek.com/log/%s%s</link>""" % (relurl, rellink)
    print """<description>%s ...</description>""" % " ".join(removeHTML(description). split()[:EXCERPT_LENGTH]). replace('&lsaquo;#&rsaquo;', '')
    print """<content:encoded><![CDATA[%s]]></content:encoded>""" % description
    print """</item>"""
print """</channel>"""
print """</rss>"""

§

Respond privately

I am no longer accepting public comments on this post, but you can use this form to contact me privately. (Your message will not be published.)



§

firehosecodeplanet

© 2001–9 Mark Pilgrim