"""Get info on HTML page Can retrieve page title, page language, and associated RSS feed parseURL takes URL parseHTML takes raw HTML source >>> import htmlinfo >>> htmlinfo.parseURL('http://diveintomark.org/') {'lang': 'en', 'sublang': None, 'langname': 'English', 'title': 'dive into mark', 'url': 'http://diveintomark.org/', 'rss': 'http://diveintomark.org/xml/rss.xml'} Many sites do not specify a page language. You can set the global DEFAULTLANG attribute to sepcify a default language. """ __author__ = "Mark Pilgrim (mark@diveintomark.org)" __version__ = "$Revision: 1.3 $" __date__ = "$Date: 2002/08/08 18:26:27 $" __copyright__ = "Copyright (c) 2002 Mark Pilgrim" __license__ = "GPL" try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py timeoutsocket.setDefaultSocketTimeout(10) except ImportError: pass import urllib, urlparse, sys, pprint from sgmllib import SGMLParser import iso639 DEFAULTLANG = None class _MyParser(SGMLParser): def reset(self): SGMLParser.reset(self) self.capturekey = None self.info = {'url': None, 'lang': None, 'sublang': None, 'langname': None, 'rss': None, 'title': None} def parseLangCode(self, langCode): valuelist = langCode.split('-', 2) if len(valuelist) == 2: lang, sublang = valuelist else: lang = langCode sublang = None lang = lang.lower() lang = iso639.commonMistakes.get(lang, lang) langname = iso639.isoLang.get(lang, None) return lang, sublang, langname def setLangFromHTMLTag(self, attrs): valuelist = [e[1] for e in attrs if e[0]=='xml:lang'] if not valuelist: valuelist = [e[1] for e in attrs if e[0]=='lang'] if not valuelist: return self.info["lang"], self.info["sublang"], self.info["langname"] = self.parseLangCode(valuelist[0]) def setLangFromMetaTag(self, attrs): valuelist = [e[1] for e in attrs if e[0]=='content'] if not valuelist: return self.info["lang"], self.info["sublang"], self.info["langname"] = self.parseLangCode(valuelist[0]) RSSTYPE = 'application/rss+xml' def setRSSFromLinkTag(self, attrs): valuelist = [e[1] for e in attrs if e[0]=='type'] if not valuelist: return type = valuelist[0] if type[:len(self.RSSTYPE)] != self.RSSTYPE: return valuelist = [e[1] for e in attrs if e[0]=='href'] if valuelist: self.info["rss"] = valuelist[0] def start_html(self, attrs): self.setLangFromHTMLTag(attrs) def do_meta(self, attrs): if not self.info["lang"]: # this is /supposed/ to be specified by http-equiv="Content-Language" content="...", # but lots of people use lowercase "content-language" instead, and still others use # name="content-language" instead of http-equiv="content-language" # bleah attrmap = {} for k, v in attrs: attrmap[k.lower().replace('name', 'http-equiv')] = v.lower() if attrmap.get('http-equiv', '') == 'content-language': self.setLangFromMetaTag(attrs) def do_link(self, attrs): if ('rel', 'alternate') in attrs: self.setRSSFromLinkTag(attrs) def start_title(self, attrs): self.startCapturing("title") def end_title(self): self.stopCapturing() def startCapturing(self, key): self.info[key] = "" self.capturekey = key def stopCapturing(self): self.info[self.capturekey] = self.info[self.capturekey].strip() self.capturekey = None def handle_data(self, data): if self.capturekey: self.info[self.capturekey] = self.info[self.capturekey] + data def end_head(self, attrs): self.setnomoretags() start_body = end_head def _postprocess(parser, url=None): info = parser.info if not info["url"]: info["url"] = url if info["rss"] and info["url"]: info["rss"] = urlparse.urljoin(info["url"], info["rss"]) if not info["lang"]: info["lang"] = DEFAULTLANG return info def parseHTML(html, url=None): parser = _MyParser() try: parser.feed(html) except: pass return _postprocess(parser, url) _BUFFERSIZE = 1024 def parseURL(url): parser = _MyParser() try: usock = urllib.urlopen(url) while 1: buffer = usock.read(_BUFFERSIZE) parser.feed(buffer) if parser.nomoretags: break if len(buffer) < _BUFFERSIZE: break usock.close() except: pass return _postprocess(parser, url) def test(url): data = parseURL(url) pprint.pprint(data) print if __name__ == '__main__': map(test, sys.argv[1:])