Tantek Çelik: XHTML vs. the world. Bet on the world.
#!/usr/bin/python
import urllib, re, cgi
# Maximum number of scraped entries emitted as <item> elements in the feed.
NUMBER_OF_POSTS = 5
# Maximum number of whitespace-separated words kept in each item's
# plain-text <description> excerpt.
EXCERPT_LENGTH = 40
def removeHTML(htmlSource):
    """Return htmlSource with everything between "<" and ">" removed.

    The result is stripped of leading and trailing whitespace.  This is a
    quick split-based filter, not an HTML parser: entities are left as they
    are, and in text that contains a stray ">" everything up to and
    including that ">" is dropped along with the tags.

    >>> removeHTML("<p>Hello <b>world</b></p>")
    'Hello world'
    """
    # Splitting on "<" yields pieces of the form "tag attrs>text" (the first
    # piece is whatever text precedes the first tag).  Keeping only the part
    # after the first ">" throws the tag away; a piece without ">" is kept
    # whole because split() then returns a single-element list.
    pieces = [chunk.split(">", 1)[-1] for chunk in htmlSource.split("<")]
    return "".join(pieces).strip()
def geturl(url):
    """Fetch url with urllib.urlopen and return the whole response body.

    Any error raised while opening or reading the URL propagates to the
    caller; the connection is closed either way.
    """
    usock = urllib.urlopen(url)
    try:
        return usock.read()
    finally:
        # Close even when read() fails so the socket is not leaked.
        usock.close()
frontpagedata = geturl('http://www.tantek.com/log/')
refreshpattern = re.compile(r'''<meta http-equiv="REFRESH" content="0; URL=(.*?)"''')
relurl = refreshpattern.search(frontpagedata).group(1)
data = geturl('http://www.tantek.com/log/%s' % relurl)
itempattern = re.compile(r"""<h2>(.*?)<a href='(.*?)'.*?<div class='entry.*?>(.*?)</div>""", re.DOTALL)
print """Content-type: application/rss+xml
<?xml version="1.0"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>Tantek's Log</title>
<description>Scraped for her pleasure</description>
<link>http://www.tantek.com/log/</link>"""
for title, rellink, description in itempattern.findall(data)[:NUMBER_OF_POSTS]:
print """<item>"""
print """<title>%s</title>""" % title.strip()
print """<link>http://www.tantek.com/log/%s%s</link>""" % (relurl, rellink)
print """<description>%s ...</description>""" % " ".join(removeHTML(description). split()[:EXCERPT_LENGTH]). replace('‹#›', '')
print """<content:encoded><![CDATA[%s]]></content:encoded>""" % description
print """</item>"""
print """</channel>"""
print """</rss>"""
§
I am no longer accepting public comments on this post, but you can use this form to contact me privately. (Your message will not be published.)
§
© 2001–9 Mark Pilgrim