# -*- coding: utf-8 -*-
"""Build the "citations" pages for the weblog.

The script reads published entries from the ``mt_entry`` table, extracts
cited names from each entry's text with ``citeMatch``, and then

* writes one page per cited name under ``BASEDIR/<dirified name>/index``,
* writes an overall index at ``BASEDIR/index``,
* rewrites each individual post page so that it lists its citations.

This is a Python 2 script (it imports the Python 2 ``ConfigParser`` module).
The database connection is opened at import time using the ``[weblog]``
section of ``~/db.conf``.

NOTE(review): several string literals in this file look like they have lost
their markup (probably HTML tags / comment markers stripped by a tool):
the ``citeMatch`` pattern, the ``regexp ''`` SQL clause, the ``split('')``
separators and the HTML snippets whose ``%s`` count does not match their
argument tuple.  They are kept byte-for-byte here because the original text
cannot be recovered from this file; each site carries its own note.  They
must be restored from a pristine copy before the script can run.
"""

import os
import re
import time
from ConfigParser import ConfigParser

import MySQLdb

WEBDIR = '/home/mark/web/diveintomark.org/'
BASEDIR = os.path.join(WEBDIR, 'archives', 'citations')
TEMPLATEDIR = '/home/mark/templates/citations/'
CITATION_TEMPLATE = os.path.join(TEMPLATEDIR, 'citation')
CITATION_INDEX = os.path.join(TEMPLATEDIR, 'citationindex')
# A name needs at least this many citing entries to get its own page.
MINIMUM_NUMBER_OF_CITATIONS = 2

# Every option of the [weblog] section is passed to MySQLdb.connect() as a
# keyword argument.
config = ConfigParser()
config.read([os.path.expanduser('~/db.conf')])
connectionparams = {}
for o in config.options('weblog'):
    connectionparams[o] = config.get('weblog', o)
dbhandle = MySQLdb.connect(**connectionparams)
cursor = dbhandle.cursor()


def read(filename):
    """Return the full contents of ``filename``, read in binary mode."""
    # ``with`` closes the handle even when read() raises.
    with open(filename, 'rb') as fsock:
        return fsock.read()


def write(filename, data):
    """Overwrite ``filename`` with ``data``, written in binary mode."""
    with open(filename, 'wb') as fsock:
        fsock.write(data)


def stripTags(htmlSource):
    """strip all HTML tags"""
    # For every "<"-delimited piece keep only what follows the first ">"
    # (or the whole piece when it contains no ">").
    return "".join([e.split(">", 1).pop()
                    for e in htmlSource.split("<")]).strip()


def q(sql):
    """Execute ``sql`` and return all rows as a list of dicts.

    Each dict maps a column name (first item of every ``cursor.description``
    entry) to that row's value.
    """
    cursor.execute(sql)
    columnNames = [column[0] for column in cursor.description]
    return [dict(zip(columnNames, rowdata)) for rowdata in cursor.fetchall()]


def x(sql):
    """Execute ``sql`` without fetching any result."""
    cursor.execute(sql)


def getCitationPosts():
    """Return the candidate entries (as dicts from ``q``) to scan for cites.

    NOTE(review): the trailing ``regexp ''`` looks like it lost its pattern;
    restore it from a pristine copy.
    """
    return q(
        "select entry_id, entry_title, entry_text, entry_created_on, "
        "year(entry_created_on) as entry_year, "
        "lpad(concat(month(entry_created_on), ''), 2, '0') as entry_month, "
        "lpad(concat(dayofmonth(entry_created_on), ''), 2, '0') as entry_day "
        "from mt_entry "
        "where entry_blog_id = 3 and entry_status = 2 "
        "and entry_text regexp ''"
    )


# Ordered (source, target) replacements applied by dirify().  Order matters:
# spaces become underscores before the single '__' -> '_' pass.
dirmap = (('%', ''), ('$', ''), (')', ''), ('.', ''), (';', ''), ('=', ''),
          ('(', ''), ('-', ''), ('/', ''), (':', ''), ('+', ''), ('#', ''),
          ('!', ''), ('?', ''), (',', ''), ("'", ''), (' ', '_'),
          ('__', '_'))


def dirify(s):
    """Lower-case ``s`` and apply every ``dirmap`` replacement in order.

    The result is used as a directory / URL component, so the exact mapping
    is deliberately left untouched.
    """
    s = s.lower()
    for source, target in dirmap:
        s = s.replace(source, target)
    return s


def getPostURL(year, month, day, title):
    """Return the archive URL of a post from its date parts and title."""
    return 'http://diveintomark.org/archives/%s/%s/%s/%s' % (
        year, month, day, dirify(title))


def getAllPostURLs():
    """Return the archive URL of every entry selected by the query below."""
    rows = q(
        "select year(entry_created_on) as entry_year, "
        "lpad(concat(month(entry_created_on), ''), 2, '0') as entry_month, "
        "lpad(concat(dayofmonth(entry_created_on), ''), 2, '0') as entry_day, "
        "entry_title "
        "from mt_entry "
        "where entry_blog_id = 3 and entry_status = 2"
    )
    return [getPostURL(row['entry_year'], row['entry_month'],
                       row['entry_day'], row['entry_title'])
            for row in rows]


# NOTE(review): this pattern is not valid regex syntax ("(?P" must be
# followed by "<name>", "=name)" ...), so re.compile() rejects it when the
# module is loaded.  aggregateCitations() unpacks three groups per match, so
# the original pattern had three groups; restore it from a pristine copy.
citeMatch = re.compile(r'(?P.*?)', re.DOTALL)


def aggregateCitations(rows):
    """Group entries by the names they cite.

    ``rows`` are entry dicts as returned by ``getCitationPosts``.  Each row
    is modified in place: ``entry_url`` is added and ``entry_text`` removed.

    Returns a list of ``(count, name, cites)`` tuples sorted descending,
    restricted to names with at least ``MINIMUM_NUMBER_OF_CITATIONS`` cites.
    ``cites`` is a list of ``(entry_date, row)`` pairs, newest first, with at
    most one pair per distinct ``entry_date`` string.
    """
    d = {}
    for row in rows:
        row['entry_url'] = getPostURL(row['entry_year'], row['entry_month'],
                                      row['entry_day'], row['entry_title'])
        entry_text = row['entry_text']
        del row['entry_text']
        for unused1, unused2, name in citeMatch.findall(entry_text):
            name = stripTags(name)
            if not name:
                continue
            dated = d.setdefault(name, [])
            entry_date = row['entry_created_on'].strftime('%Y%m%dT%H%M%S')
            # One citation per name per timestamp.
            if not any(onedate == entry_date for onedate, onerow in dated):
                dated.append((entry_date, row))

    items = []
    for k, v in d.items():
        if len(v) >= MINIMUM_NUMBER_OF_CITATIONS:
            # Dates are unique within v, so the row dicts are never compared.
            v.sort(reverse=True)
            items.append((len(v), k, v))
    # (count, name) is unique per item, so ordering is fully determined.
    items.sort(reverse=True)
    return items


def outputCitation(name, cites):
    """Write the page for one cited name, creating its directory if needed.

    The page is ``CITATION_TEMPLATE`` formatted with ``vars()``, so the local
    variable names in this function are visible to the template: do not
    rename them without checking the template.

    NOTE(review): the snippet below has two ``%s`` for three arguments (its
    markup appears to have been stripped), so formatting raises TypeError as
    written.
    """
    fullpath = os.path.join(BASEDIR, dirify(name))
    if not os.path.exists(fullpath):
        os.makedirs(fullpath)
    fullpath = os.path.join(fullpath, 'index')
    citelist = "\n".join(["""
  • [%s] %s

  • """ % (params['entry_created_on'].strftime('%m/%d/%Y'),
           params['entry_url'],
           params['entry_title'])
                          for dummy, params in cites])
    write(fullpath, read(CITATION_TEMPLATE) % vars())


def output(agg):
    """Write one citation page per aggregated name."""
    for c, name, cites in agg:
        outputCitation(name, cites)


def outputIndex(agg):
    """Write the overall citation index to ``BASEDIR/index``.

    The page is ``CITATION_INDEX`` formatted with ``vars()``, so the local
    variable names in this function are visible to the template.

    NOTE(review): the snippet below has two ``%s`` for three arguments (its
    markup appears to have been stripped), so formatting raises TypeError as
    written.
    """
    fullpath = os.path.join(BASEDIR, 'index')
    citelist = "\n".join(["""
  • %s (%s)
  • """ % (dirify(name), name, c) for c, name, cites in agg])
    write(fullpath, read(CITATION_INDEX) % vars())


def zapIndividualCites():
    """Reset the citation section of every post page on disk.

    NOTE(review): both ``split('')`` separators are empty, which raises
    ValueError as written; they look like two stripped marker strings.  The
    two splits are kept separate because the original markers may differ.
    """
    for url in getAllPostURLs():
        fullpath = url.replace('http://diveintomark.org/', WEBDIR)
        olddata = read(fullpath)
        newdata = (olddata.split('', 1)[0] + '\r\n' +
                   olddata.split('', 1)[1])
        # Only touch the file when something actually changed.
        if olddata != newdata:
            write(fullpath, newdata)


def outputIndividual(agg):
    """Insert each post's sorted list of cited names into its page on disk.

    All post pages are first reset with ``zapIndividualCites``.

    NOTE(review): the ``split('')`` separators and the ``''`` wrappers are
    empty, and the snippet has one ``%s`` for two arguments; all of them look
    like stripped markup and fail as written.
    """
    zapIndividualCites()

    byurl = {}
    for c, name, cites in agg:
        for dummy, params in cites:
            byurl.setdefault(params['entry_url'], []).append(name)

    for url, names in byurl.items():
        names.sort()
        lines = []
        for n in names:
            lines.append(r'''
  • HomeArchivesCitations%s
  • ''' % (dirify(n), n))
        fullpath = url.replace('http://diveintomark.org/', WEBDIR)
        olddata = read(fullpath)
        newdata = (olddata.split('', 1)[0] + '' + '\n'.join(lines) + '' +
                   olddata.split('', 1)[1])
        if olddata != newdata:
            write(fullpath, newdata)


def main():
    """Regenerate every citation page, the index and the per-post lists."""
    posts = getCitationPosts()
    agg = aggregateCitations(posts)
    output(agg)
    outputIndex(agg)
    outputIndividual(agg)


if __name__ == '__main__':
    main()