#!/usr/bin/python
"""Excerpt the text surrounding a link to a target URL in a blog's HTML.

Written for Python 2: it depends on the ``sgmllib`` and ``urlparse`` modules.
See getExcerpt for the main entry point.
"""

__author__ = "Mark Pilgrim (mark@diveintomark.org)"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "GPL"
__history__ = """
1.0 - 10/10/2002 - MAP - initial release
1.1 - 10/11/2002 - MAP - fixed bug that would crash on CDATA sections (like
  that funky super-comment syntax the cool kids are using to hide Javascript
  from browsers these days).  Thanks to Phil for being cool enough to crash my
  parser on its first day of deployment.
1.2 - 10/11/2002 - MAP - added permalink/anchor logic
1.3 - 10/11/2002 - MAP - collapse "www." domains, find more permalinks
  (inline in excerpt)
"""

# timeoutsocket is optional: when it is installed, socket operations time out
# after 10 seconds; when it is missing we deliberately carry on without it.
try:
    import timeoutsocket  # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass

import cgi
import re
import sgmllib
import string
import sys
import urlparse


def _normalizeDomain(domain):
    """Return domain lowercased and without a leading "www."

    Only a leading "www." is removed, so a host that merely contains the
    text "www." somewhere else is left intact.
    """
    domain = domain.lower()
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain


class LinkbackParser(sgmllib.SGMLParser):
    """parses HTML source of a blog to excerpt text surrounding a link

    link to look for is given in targetURL
    link of HTML source is given in refererURL
    use feed() method to feed HTML source (can be called repeatedly with
      chunks of any size); call close() after the last chunk
    use output() method to get excerpt
    use .permalink attribute to get permalink of excerpt (if found)
    use .anchor attribute to get anchor of excerpt on refererURL
      (use as fallback if no permalink)
    see getExcerpt function for usage example
    """

    def __init__(self, refererURL, targetURL):
        # SGMLParser.__init__ calls self.reset(), which sets up the
        # excerpt state before the attributes below are assigned.
        sgmllib.SGMLParser.__init__(self)
        self.referer = refererURL
        self.refererDomain = _normalizeDomain(
            urlparse.urlparse(self.referer)[1])
        self.target = targetURL

    def reset(self):
        """Clear the excerpt state and reset the underlying parser."""
        self.pieces = []       # tags and text collected for the excerpt
        self.started = 0       # set once the link to the target is seen
        self.done = 0          # set once the excerpt is frozen
        self.anchor = None     # last id/name seen before the target link
        self.permalink = None  # permalink of the excerpt, if one is found
        sgmllib.SGMLParser.reset(self)

    def divider(self, attrs=()):
        """restart excerpt if not started yet, stop if started,
        do nothing if done

        The way this works is that we parse through the entire HTML source,
        one tag at a time, from top to bottom.  Along the way we collect the
        excerpt text (and tags).  Once we hit the link to the target URL
        (handled in start_a, below), we set a flag (self.started) saying
        we've found the link.  If we hit one of several divider tags
        (defined below) before we've found the target link, we just throw
        away everything we've gathered so far and start re-gathering the
        excerpt text and tags.  If we hit one of the divider tags after
        we've found the target link, we set another flag (self.done) and
        freeze the excerpt.

        Must be called directly from a method named start_<tag> or
        end_<tag>: the tag name and start/end kind are recovered from the
        name of the calling method.
        """
        if self.done:
            return
        if not self.started:
            self.pieces = []
        # Inspect the caller's frame to learn which handler invoked us.
        callingMethodName = sys._getframe(1).f_code.co_name
        nameParts = callingMethodName.split('_')
        tag = nameParts[-1]
        if nameParts[0] == 'end':
            self.unknown_endtag(tag)
        else:
            self.unknown_starttag(tag, attrs)
        # The divider tag itself is recorded above, then the excerpt is
        # frozen if the target link has already been seen.
        if self.started:
            self.done = 1

    # divider tags
    def start_blockquote(self, attrs): self.divider(attrs)
    def end_blockquote(self): self.divider()
    def start_td(self, attrs): self.divider(attrs)
    def end_td(self): self.divider()
    def start_div(self, attrs): self.divider(attrs)
    def end_div(self): self.divider()
    def start_span(self, attrs): self.divider(attrs)
    def end_span(self): self.divider()
    def start_p(self, attrs): self.divider(attrs)
    def end_p(self): self.divider()
    def start_br(self, attrs): self.divider(attrs)
    def start_hr(self, attrs): self.divider(attrs)
    def start_pre(self, attrs): self.divider(attrs)
    def end_pre(self): self.divider()
    def start_h1(self, attrs): self.divider(attrs)
    def end_h1(self): self.divider()
    def start_h2(self, attrs): self.divider(attrs)
    def end_h2(self): self.divider()
    def start_h3(self, attrs): self.divider(attrs)
    def end_h3(self): self.divider()
    def start_h4(self, attrs): self.divider(attrs)
    def end_h4(self): self.divider()
    def start_h5(self, attrs): self.divider(attrs)
    def end_h5(self): self.divider()
    def start_h6(self, attrs): self.divider(attrs)
    def end_h6(self): self.divider()

    def _foundPermalink(self, href):
        """Record href as the permalink if it points back at the referer.

        href is resolved against the referer URL first.  It is accepted
        when its domain matches the referer's domain, or for the
        hard-coded scriptingnews.userland.com special case below.
        """
        href = urlparse.urljoin(self.referer, href)
        domain = _normalizeDomain(urlparse.urlparse(href)[1])
        if domain == self.refererDomain:
            self.permalink = href
        elif (domain == 'scriptingnews.userland.com' and
              self.refererDomain in ('scripting.com',
                                     'avantgo.userland.com')):
            self.permalink = href

    def start_a(self, attrs):
        """Handle <a>: detect the target link and candidate permalinks."""
        hrefs = [v for k, v in attrs if k == 'href']
        if hrefs:
            href = hrefs[0]
            # Case-insensitive substring match against the target URL.
            if href.lower().find(self.target.lower()) != -1:
                self.started = 1
            if self.started and not self.done and not self.permalink:
                # Inside the excerpt: a link whose title says it is a
                # permalink is taken as the excerpt's permalink.
                titles = [v for k, v in attrs if k == 'title']
                if titles:
                    title = titles[0].lower()
                    if (title.find('permanent link') != -1 or
                            title.find('permalink') != -1):
                        self._foundPermalink(href)
            if self.done and self.anchor and not self.permalink:
                # After the excerpt: a link that mentions the excerpt's
                # anchor is taken as its permalink.
                if href.find(self.anchor) != -1:
                    self._foundPermalink(href)
        self.unknown_starttag('a', attrs)

    def unknown_starttag(self, tag, attrs):
        """Record a start tag and remember the latest id/name anchor."""
        if self.done:
            return
        strattrs = "".join([' %s="%s"' % (key, value)
                            for key, value in attrs])
        self.pieces.append("<%s%s>" % (tag, strattrs))
        if not self.started:
            # Track the most recent anchor seen before the target link.
            anchors = [value for key, value in attrs
                       if key in ('id', 'name')]
            if anchors:
                self.anchor = anchors[0]

    def unknown_endtag(self, tag):
        """Record an end tag."""
        if self.done:
            return
        self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        """Record a numeric character reference unchanged."""
        if self.done:
            return
        self.pieces.append("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Record an entity reference unchanged."""
        if self.done:
            return
        self.pieces.append("&%s;" % ref)

    def handle_data(self, text):
        """Record plain text."""
        if self.done:
            return
        self.pieces.append(text)

    def handle_comment(self, text):
        """Record an HTML comment."""
        if self.done:
            return
        self.pieces.append("<!--%s-->" % text)
        # NOTE(review): the copy of this file that was recovered had extra
        # logic here, guarded by `if not self.started:`, that searched the
        # comment text with text.find(...).  Its contents were destroyed
        # (markup stripped from the source) and could not be reconstructed;
        # restore it from an intact copy if one is available.

    def handle_decl(self, text):
        """Record a declaration such as a doctype."""
        if self.done:
            return
        self.pieces.append("<!%s>" % text)

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)

    def parse_declaration(self, i):
        """Parse the declaration starting at self.rawdata[i].

        Overrides the internal declaration handler to handle CDATA blocks:
        the block's content is HTML-escaped and passed to handle_data.
        Returns the index just past the declaration.
        """
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated block: treat the rest of the buffer as CDATA.
                k = len(self.rawdata)
            self.handle_data(cgi.escape(self.rawdata[i+9:k]))
            return k + 3
        return sgmllib.SGMLParser.parse_declaration(self, i)


def stripTags(htmlSource):
    """strip all HTML tags

    Splits on "<" and keeps only what follows the first ">" of each chunk
    (or the whole chunk when it contains no ">"), then strips surrounding
    whitespace from the result.
    """
    chunks = htmlSource.split("<")
    return "".join([chunk.split(">", 1)[-1] for chunk in chunks]).strip()


def safechar(aChar):
    """map known illegal HTML characters to plain-text,
    filter out everything else

    Codes 151, 146, 147 and 148 (the Windows-1252 em dash and curly
    quotes) become ASCII equivalents; any other character above 127 is
    dropped; everything else is returned unchanged.
    """
    o = ord(aChar)
    if o == 151:
        return ' -- '
    if o == 146:
        return "'"
    if o in (147, 148):
        return '"'
    if o > 127:
        return ''
    return aChar


def getExcerpt(htmlSource, refererURL, targetURL, stripHTML=1):
    """get excerpt surrounding link to targetURL in htmlSource

    htmlSource -- HTML of the referring page
    refererURL -- URL that htmlSource was fetched from
    targetURL  -- link to look for in htmlSource
    stripHTML  -- if true, remove tags from the excerpt

    Returns a tuple (excerpt, refererLink).  refererLink is the permalink
    if one was found, else refererURL with its fragment replaced by the
    excerpt's anchor if one was found, else refererURL itself.
    """
    p = LinkbackParser(refererURL, targetURL)
    p.feed(htmlSource)
    # Flush whatever the parser is still buffering; without this, trailing
    # data after the last complete construct is never processed.
    p.close()
    output = p.output()
    if stripHTML:
        output = stripTags(output)
    output = "".join([safechar(c) for c in output])
    if p.permalink:
        refererLink = p.permalink
    elif p.anchor:
        refererLink = refererURL.split('#', 1)[0] + "#" + p.anchor
    else:
        refererLink = refererURL
    return output, refererLink


if __name__ == "__main__":
    import urllib

    # Command line: [refererURL [targetURL]]; empty or missing arguments
    # fall back to the defaults below.
    refererURL = 'http://intertwingly.net/blog/'
    if len(sys.argv) > 1 and sys.argv[1]:
        refererURL = sys.argv[1]
    targetURL = 'http://diveintomark.org/archives/2002/10/11.html'
    if len(sys.argv) > 2 and sys.argv[2]:
        targetURL = sys.argv[2]

    usock = urllib.urlopen(refererURL)
    try:
        data = usock.read()
    finally:
        # Close the connection even if the read fails.
        usock.close()

    output, refererLink = getExcerpt(data, refererURL, targetURL, 0)
    print('excerpt: %s' % output)
    print('refererLink: %s' % refererLink)