"""Get info on HTML page
Can retrieve page title, page language, and associated RSS feed
parseURL takes URL
parseHTML takes raw HTML source
>>> import htmlinfo
>>> htmlinfo.parseURL('http://diveintomark.org/')
{'lang': 'en',
'sublang': None,
'langname': 'English',
'title': 'dive into mark',
'url': 'http://diveintomark.org/',
'rss': 'http://diveintomark.org/xml/rss.xml'}
Many sites do not specify a page language. You can set the global
DEFAULTLANG attribute to specify a default language.
"""
__author__ = "Mark Pilgrim (mark@diveintomark.org)"
__version__ = "$Revision: 1.3 $"
__date__ = "$Date: 2002/08/08 18:26:27 $"
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "GPL"
# Optional dependency: if the timeoutsocket module can be imported, set its
# default socket timeout to 10 (presumably seconds -- confirm against the
# timeoutsocket documentation).  If it is missing, carry on without it.
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass
import urllib, urlparse, sys, pprint
from sgmllib import SGMLParser
import iso639
# Language code that _postprocess() stores in info['lang'] when the page did
# not declare a language itself.  Callers may rebind this module attribute.
DEFAULTLANG = None
class _MyParser(SGMLParser):
    """Collect title, language and RSS feed link from an HTML document.

    Results accumulate in self.info, a dictionary with the keys 'url',
    'lang', 'sublang', 'langname', 'rss' and 'title'; every value starts
    out as None.  Tag processing is switched off (setnomoretags) as soon as
    the <head> element ends or a <body> tag is seen.
    """

    # MIME type that marks a <link> element as pointing at an RSS feed.
    RSSTYPE = 'application/rss+xml'

    def reset(self):
        """Reset the underlying parser and clear all collected info."""
        SGMLParser.reset(self)
        # Key of self.info that is currently receiving character data,
        # or None when no text is being captured.
        self.capturekey = None
        self.info = {'url': None,
                     'lang': None,
                     'sublang': None,
                     'langname': None,
                     'rss': None,
                     'title': None}

    def _firstAttr(self, attrs, name):
        """Return the value of the first attribute called name, or None."""
        for key, value in attrs:
            if key == name:
                return value
        return None

    def _setLang(self, langCode):
        """Parse langCode and store the result in self.info."""
        lang, sublang, langname = self.parseLangCode(langCode)
        self.info["lang"] = lang
        self.info["sublang"] = sublang
        self.info["langname"] = langname

    def parseLangCode(self, langCode):
        """Split a language code such as 'en-US' into its parts.

        Returns a (lang, sublang, langname) tuple:
        lang     -- lower-cased primary code, after correction through
                    iso639.commonMistakes
        sublang  -- everything after the first hyphen, or None if the
                    code contains no hyphen
        langname -- iso639.isoLang entry for lang, or None if unknown
        """
        # Split on the first hyphen only, so that codes carrying several
        # subtags ('zh-Hant-TW') still yield their primary language.
        parts = langCode.strip().split('-', 1)
        lang = parts[0].lower()
        if len(parts) == 2:
            sublang = parts[1]
        else:
            sublang = None
        lang = iso639.commonMistakes.get(lang, lang)
        langname = iso639.isoLang.get(lang, None)
        return lang, sublang, langname

    def setLangFromHTMLTag(self, attrs):
        """Set the language from <html xml:lang=...> or <html lang=...>.

        xml:lang takes precedence; lang is only consulted when xml:lang is
        absent.  Does nothing when neither attribute is present.
        """
        code = self._firstAttr(attrs, 'xml:lang')
        if code is None:
            code = self._firstAttr(attrs, 'lang')
        if code is None:
            return
        self._setLang(code)

    def setLangFromMetaTag(self, attrs):
        """Set the language from the content attribute of a <meta> tag.

        Does nothing when the tag has no content attribute.
        """
        content = self._firstAttr(attrs, 'content')
        if content is None:
            return
        # Content-Language may hold a comma-separated list of languages;
        # use the first entry as the page language.
        self._setLang(content.split(',')[0])

    def setRSSFromLinkTag(self, attrs):
        """Record the href of a <link> whose type starts with RSSTYPE.

        The href is stored as found (possibly relative).  A later matching
        <link> overwrites an earlier one.
        """
        linktype = self._firstAttr(attrs, 'type')
        if linktype is None:
            return
        # MIME types are case-insensitive; a prefix match tolerates
        # trailing parameters such as '; charset=...'.
        if not linktype.strip().lower().startswith(self.RSSTYPE.lower()):
            return
        href = self._firstAttr(attrs, 'href')
        if href is not None:
            self.info["rss"] = href

    def start_html(self, attrs):
        """Handle <html>: pick up the page language from its attributes."""
        self.setLangFromHTMLTag(attrs)

    def do_meta(self, attrs):
        """Handle <meta>: use Content-Language if no language is set yet."""
        if self.info["lang"]:
            return
        # this is /supposed/ to be specified by http-equiv="Content-Language" content="...",
        # but lots of people use lowercase "content-language" instead, and still others use
        # name="content-language" instead of http-equiv="content-language"
        # bleah
        for key, value in attrs:
            if key.lower() in ('http-equiv', 'name') and value.lower() == 'content-language':
                self.setLangFromMetaTag(attrs)
                return

    def do_link(self, attrs):
        """Handle <link>: look for an RSS feed on rel="alternate" links."""
        for key, value in attrs:
            # rel is a whitespace-separated, case-insensitive list of link
            # types, e.g. rel="Alternate" or rel="alternate feed".
            if key == 'rel' and 'alternate' in value.lower().split():
                self.setRSSFromLinkTag(attrs)
                return

    def start_title(self, attrs):
        """Handle <title>: begin capturing the page title."""
        self.startCapturing("title")

    def end_title(self):
        """Handle </title>: finish capturing the page title."""
        self.stopCapturing()

    def startCapturing(self, key):
        """Start appending character data to self.info[key]."""
        self.info[key] = ""
        self.capturekey = key

    def stopCapturing(self):
        """Strip the captured text and stop capturing.

        Safe to call when nothing is being captured.
        """
        if self.capturekey is None:
            return
        self.info[self.capturekey] = self.info[self.capturekey].strip()
        self.capturekey = None

    def handle_data(self, data):
        """Append character data to the value being captured, if any."""
        if self.capturekey:
            self.info[self.capturekey] = self.info[self.capturekey] + data

    def start_head(self, attrs):
        """Handle <head>.

        Nothing to do here; the handler exists because SGMLParser only
        calls end_head() for tags that were opened through a start_
        handler.
        """
        pass

    def end_head(self, attrs=None):
        """Stop processing tags: everything of interest lives in <head>.

        Serves both as the </head> handler (called without arguments) and,
        through the start_body alias, as the <body> handler (called with
        the tag's attribute list), hence the optional attrs parameter.
        """
        self.setnomoretags()

    start_body = end_head
def _postprocess(parser, url=None):
info = parser.info
if not info["url"]:
info["url"] = url
if info["rss"] and info["url"]:
info["rss"] = urlparse.urljoin(info["url"], info["rss"])
if not info["lang"]:
info["lang"] = DEFAULTLANG
return info
def parseHTML(html, url=None):
    """Extract page info from raw HTML source.

    html -- HTML source text to parse
    url  -- optional page URL; recorded in the result and used as the
            base for resolving a relative RSS link

    Returns the info dictionary produced by _postprocess().  Parsing is
    best-effort: if the parser raises, whatever was collected up to that
    point is still returned.
    """
    parser = _MyParser()
    try:
        parser.feed(html)
        # Force processing of anything the parser is still buffering.
        parser.close()
    except Exception:
        # Malformed markup must not prevent returning partial results.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return _postprocess(parser, url)
# Number of bytes requested from the connection per read in parseURL().
_BUFFERSIZE = 1024


def parseURL(url):
    """Download url and extract page info from it.

    The page is read in _BUFFERSIZE chunks and reading stops as soon as
    the parser has switched off tag processing, so usually only the start
    of the document is transferred.

    Returns the info dictionary produced by _postprocess().  Fetching and
    parsing are best-effort: on a network or parse error, whatever was
    collected up to that point is still returned.
    """
    parser = _MyParser()
    try:
        usock = urllib.urlopen(url)
        # Nested try/finally so the connection is closed even when
        # reading or parsing fails part-way through.
        try:
            while True:
                chunk = usock.read(_BUFFERSIZE)
                if not chunk:
                    # An empty read is the only reliable end-of-stream
                    # signal; a short read alone is not.
                    break
                parser.feed(chunk)
                if parser.nomoretags:
                    break
        finally:
            usock.close()
        # Force processing of anything the parser is still buffering.
        parser.close()
    except Exception:
        # Deliberately ignored so partial results can be returned.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return _postprocess(parser, url)
def test(url):
    """Fetch url, pretty-print the extracted info, then print a blank line."""
    pprint.pprint(parseURL(url))
    print("")
if __name__ == '__main__':
    # Run the demo once for every URL given on the command line.
    for arg in sys.argv[1:]:
        test(arg)