"""Blogroll finder and aggregator""" __author__ = "Mark Pilgrim (mark@diveintomark.org)" __copyright__ = "Copyright 2002, Mark Pilgrim" __license__ = "Python" try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py timeoutsocket.setDefaultSocketTimeout(10) except: pass import urllib, urlparse, os, time, operator, sys, pickle, re, cgi from sgmllib import SGMLParser from threading import * BUFFERSIZE = 1024 IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc') INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp') IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org', 'freshmeat.net', 'readroom.ipl.org', 'www.amazon.com', 'www.ringsurf.com') def prettyURL(url): protocol, domain, path, params, query, fragment = urlparse.urlparse(url) if path == '/': path = '' return urlparse.urlunparse(('', domain, path, '', '', '')).replace('//', '') def simplifyURL(url): url = url.replace('www.', '') url = url.replace('/coming.html', '/') protocol, domain, path, params, query, fragment = urlparse.urlparse(url) if path == '': url = url + '/' return url class MinimalURLOpener(urllib.FancyURLopener): def __init__(self, *args): apply(urllib.FancyURLopener.__init__, (self,) + args) self.addheaders = [('User-agent', '')] def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): pass class BlogrollParser(SGMLParser): def __init__(self, url): SGMLParser.__init__(self) self.url = url self.reset() def reset(self): SGMLParser.reset(self) self.possible = [] self.blogroll = [] self.ina = 0 def _goodlink(self, href): protocol, domain, path, params, query, fragment = urlparse.urlparse(href) if protocol.lower() <> 'http': return 0 if self.url.find(domain) <> -1: return 0 if domain in IGNOREDOMAINS: return 0 if domain.find(':5335') <> -1: return 0 if domain.find('.google') <> -1: return 0 if fragment: return 0 shortpath, ext = os.path.splitext(path) ext = ext.lower() if ext in INCLUDEEXTS: return 1 if ext.lower() in IGNOREEXTS: return 0 # more rules here? return 1 def _confirmpossibles(self): if len(self.possible) >= 4: for url in self.possible: if url not in self.blogroll: self.blogroll.append(url) self.possible = [] def start_a(self, attrs): self.ina = 1 hreflist = [e[1] for e in attrs if e[0]=='href'] if not hreflist: return href = simplifyURL(hreflist[0]) if self._goodlink(href): self.possible.append(href) def end_a(self): self.ina = 0 def handle_data(self, data): if self.ina: return if data.strip(): self._confirmpossibles() def end_html(self, attrs): self.confirmpossibles() def getRadioBlogroll(url): try: usock = MinimalURLOpener().open('%s/gems/mySubscriptions.opml' % url) opmlSource = usock.read() usock.close() except: return [] if opmlSource.find(' 'http://': url = 'http://' + url radioBlogroll = getRadioBlogroll(url) if radioBlogroll: return radioBlogroll parser = BlogrollParser(url) try: usock = MinimalURLOpener().open(url) htmlSource = usock.read() usock.close() except: return [] parser.feed(htmlSource) return parser.blogroll class BlogrollThread(Thread): def __init__(self, master, url): Thread.__init__(self) self.master = master self.url = url def run(self): self.master.callback(self.url, getBlogroll(self.url)) class BlogrollThreadMaster: def __init__(self, url, recurse): self.blogrollDict = {} self.done = 0 if type(url)==type(''): blogroll = getBlogroll(url) else: blogroll = url self.run(blogroll, recurse) def callback(self, url, blogroll): if not self.done: self.blogrollDict[url] = blogroll def run(self, blogroll, recurse): ## print 'Analyzing %s sites' % len(blogroll) start = 0 end = 5 while 1: threads = [] for url in blogroll[start:end]: if not self.blogrollDict.has_key(url): t = BlogrollThread(self, url) threads.append(t) for t in threads: t.start() time.sleep(0.000001) for t in threads: ## print t.url time.sleep(0.000001) t.join(10) start += 5 end += 5 if start > len(blogroll): break ## print if recurse > 1: masterlist = reduce(operator.add, self.blogrollDict.values()) newlist = [url for url in masterlist if not self.blogrollDict.has_key(url)] self.run(newlist, recurse - 1) else: self.done = 1 def sortBlogrollData(blogrollDict): sortD = {} for blogroll in blogrollDict.values(): for url in blogroll: sortD[url] = sortD.setdefault(url, 0) + 1 sortI = [(v, k) for k, v in sortD.items()] sortI.sort() sortI.reverse() return sortI def blogdiff(sortI, baseUrl): blogroll = getBlogroll(baseUrl) newblogs = [(c, url) for c, url in sortI if url not in blogroll] return newblogs ##def savedata(data, filename): ## fsock = open(filename, 'w') ## pickle.dump(data, fsock) ## fsock.close() def trimdata(sortI, cutoff): return [(c, url) for c, url in sortI if c >= cutoff] def printdata(data): import pprint pprint.pprint(data) def printhtmllist(data): print "
    " for c, url in data: print """
  1. %s (%s recommendations)
  2. """ % (url, prettyURL(url), c) print "
" def gethtmltable(baseURL, data): output = [] output.append(""" """ % cgi.escape(baseURL), cgi.escape(baseURL)) for c, url in data: output.append("""""" % (url, prettyURL(url), c)) output.append("""
Neighborhood for %s
Name Links
%s%s
""") return "".join(output) def printhtmltable(baseURL, data): print gethtmltable(baseURL, data) def findNewBlogsByBlogroll(baseURL): blogrollDict = BlogrollThreadMaster(baseURL, 1).blogrollDict ## savedata(blogrollDict, r'c:\tmp\blogroll.dat') newblogrolldata = blogdiff(sortBlogrollData(blogrollDict), baseURL) ## savedata(newblogrolldata, r'c:\tmp\newblogroll.dat') printdata(newblogrolldata) def findNewBlogsByReferer(baseURL, refererList): blogrollDict = BlogrollThreadMaster(refererList, 1).blogrollDict ## savedata(blogrollDict, r'c:\tmp\blogrollbyreferer.dat') newblogrolldata = blogdiff(sortBlogrollData(blogrollDict), baseURL) ## savedata(newblogrolldata, r'c:\tmp\newblogrollbyreferer.dat') printdata(newblogrolldata) def findNewBlogsByStats(baseURL, statsFilename): fsock = open(statsFilename) data = pickle.load(fsock) fsock.close() toprefererlinks = data['toprefererlinks'] topdomainlinks = [rlist for path, rlist in toprefererlinks if path == '/'][0] refererList = [refurl for refcount, reftitle, refurl in topdomainlinks] findNewBlogsByReferer(baseURL, refererList) def getRelated(url): import google results = [] start = 0 for i in range(3): data = google.doGoogleSearch('related:%s' % url, start) results.extend([oneResult.URL for oneResult in data.results]) start += 10 if len(data.results) < 10: break return results def findNewBlogsByGoogleRelated(baseURL): relatedList = getRelated(baseURL) blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict ## savedata(blogrollDict, r'c:\tmp\blogrollbyrelated.dat') newblogrolldata = blogdiff(sortBlogrollData(blogrollDict), baseURL) ## savedata(newblogrolldata, r'c:\tmp\newblogrollbyrelated.dat') newblogrolldata = trimdata(newblogrolldata, 2) printhtmltable(baseURL, newblogrolldata) def findNeighborhood(baseURL): relatedList = getRelated(baseURL) blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict neighborhood = sortBlogrollData(blogrollDict) neighborhood = trimdata(neighborhood, 2) return gethtmltable(baseURL, neighborhood) if __name__ == '__main__': findNewBlogsByGoogleRelated(sys.argv[1])