"""Blogroll finder and aggregator""" __author__ = "Mark Pilgrim (mark@diveintomark.org)" __copyright__ = "Copyright 2002, Mark Pilgrim" __license__ = "Python" try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py timeoutsocket.setDefaultSocketTimeout(10) except: pass import urllib, urlparse, os, time, operator, sys, pickle, re, cgi, time from sgmllib import SGMLParser from threading import * BUFFERSIZE = 1024 IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc') INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp') IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org', 'freshmeat.net', 'readroom.ipl.org', 'amazon.com', 'ringsurf.com') def prettyURL(url): protocol, domain, path, params, query, fragment = urlparse.urlparse(url) if path == '/': path = '' return urlparse.urlunparse(('', domain, path, '', '', '')).replace('//', '') def simplifyURL(url): url = url.replace('www.', '') url = url.replace('/coming.html', '/') protocol, domain, path, params, query, fragment = urlparse.urlparse(url) if path == '': url = url + '/' return url class MinimalURLOpener(urllib.FancyURLopener): def __init__(self, *args): apply(urllib.FancyURLopener.__init__, (self,) + args) self.addheaders = [('User-agent', '')] def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): pass class BlogrollParser(SGMLParser): def __init__(self, url): SGMLParser.__init__(self) self.url = url self.reset() def reset(self): SGMLParser.reset(self) self.possible = [] self.blogroll = [] self.ina = 0 def _goodlink(self, href): protocol, domain, path, params, query, fragment = urlparse.urlparse(href) if protocol.lower() <> 'http': return 0 if self.url.find(domain) <> -1: return 0 if domain in IGNOREDOMAINS: return 0 if domain.find(':5335') <> -1: return 0 if domain.find('.google') <> -1: return 0 if fragment: return 0 shortpath, ext = os.path.splitext(path) ext = ext.lower() if ext in INCLUDEEXTS: return 1 if ext.lower() in IGNOREEXTS: return 0 # more rules here? return 1 def _confirmpossibles(self): if len(self.possible) >= 4: for url in self.possible: if url not in self.blogroll: self.blogroll.append(url) self.possible = [] def start_a(self, attrs): self.ina = 1 hreflist = [e[1] for e in attrs if e[0]=='href'] if not hreflist: return href = simplifyURL(hreflist[0]) if self._goodlink(href): self.possible.append(href) def end_a(self): self.ina = 0 def handle_data(self, data): if self.ina: return if data.strip(): self._confirmpossibles() def end_html(self, attrs): self.confirmpossibles() def getRadioBlogroll(url): try: usock = MinimalURLOpener().open('%s/gems/mySubscriptions.opml' % url) opmlSource = usock.read() usock.close() except: return [] if opmlSource.find(' 'http://': url = 'http://' + url radioBlogroll = getRadioBlogroll(url) if radioBlogroll: return radioBlogroll parser = BlogrollParser(url) try: usock = MinimalURLOpener().open(url) htmlSource = usock.read() usock.close() except: return [] parser.feed(htmlSource) return parser.blogroll class BlogrollThread(Thread): def __init__(self, master, url): Thread.__init__(self) self.master = master self.url = url def run(self): self.master.callback(self.url, getBlogroll(self.url)) class BlogrollThreadMaster: def __init__(self, url, recurse): self.blogrollDict = {} self.done = 0 if type(url)==type(''): blogroll = getBlogroll(url) else: blogroll = url self.run(blogroll, recurse) def callback(self, url, blogroll): if not self.done: self.blogrollDict[url] = blogroll def run(self, blogroll, recurse): start = 0 end = 5 while 1: threads = [] for url in blogroll[start:end]: if not self.blogrollDict.has_key(url): t = BlogrollThread(self, url) threads.append(t) for t in threads: t.start() time.sleep(0.000001) for t in threads: time.sleep(0.000001) t.join(10) start += 5 end += 5 if start > len(blogroll): break if recurse > 1: masterlist = reduce(operator.add, self.blogrollDict.values()) newlist = [url for url in masterlist if not self.blogrollDict.has_key(url)] self.run(newlist, recurse - 1) else: self.done = 1 def sortBlogrollData(blogrollDict): sortD = {} for blogroll in blogrollDict.values(): for url in blogroll: sortD[url] = sortD.setdefault(url, 0) + 1 sortI = [(v, k) for k, v in sortD.items()] sortI.sort() sortI.reverse() return sortI def trimdata(sortI, cutoff): return [(c, url) for c, url in sortI if c >= cutoff] def getRelated(url): import google results = [] start = 0 for i in range(3): data = google.doGoogleSearch('related:%s' % url, start) results.extend([oneResult.URL for oneResult in data.results]) start += 10 if len(data.results) < 10: break return results def getNeighborhood(baseURL): relatedList = getRelated(baseURL) blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict neighborhood = sortBlogrollData(blogrollDict) neighborhood = trimdata(neighborhood, 2) neighborhood = [(c,url, prettyURL(url)) for c,url in neighborhood] return neighborhood def render_html(baseURL, data): output = [] output.append(""" """ % (cgi.escape(prettyURL(baseURL)), cgi.escape(prettyURL(baseURL)))) for c, url, title in data: output.append("""""" % (url, title, c, 'http://diveintomark.org/cgi-bin/neighborhood.cgi?url=%s' % cgi.escape(url))) output.append("""
Neighborhood for %s
Name Links Explore
%s%sexplore
""") return "".join(output) def render_rss(baseURL, data): title = prettyURL(baseURL) channeltitle = "%s neighborhood" % title localtime = time.strftime('%Y-%m-%dT%H:%M:%S-05:00', time.localtime()) output = [] output.append(""" %(channeltitle)s %(baseURL)s Sites in the virtual neighborhood of %(title)s en-us %(localtime)s %(localtime)s weekly 1 2000-01-01T12:00+00:00 """ % locals()) ##""" for c, url, title in data: output.append(""" """ % url) output.append(""" """) for c, url, title in data: output.append(""" %(title)s %(url)s %(c)s links """ % locals()) output.append("""""") return "".join(output) if __name__ == '__main__': print render_html(getNeighborhood(sys.argv[1]))