#!/usr/bin/env python
"""
ch07_feedmaker.py

Create a syndication feed (Atom by default, RSS optionally) from a
collection of HTML documents.

Usage: ch07_feedmaker.py HTML_PATH [OUTPUT_FILE]
"""
import sys
import time
import urlparse
import urllib
import htmlmetalib
from xml.sax.saxutils import escape

# Base URL that each document's path is joined onto to form its link.
BASE_HREF = 'http://www.example.com'
# Authority name used when building "tag:" URIs for entry IDs.
TAG_DOMAIN = 'example.com'
# Upper bound on the number of entries written to the feed.
MAX_ENTRIES = 15

# Feed-level values substituted into the feed template.
FEED_META = {
    'feed.title'        : 'A Sample Feed',
    'feed.link'         : 'http://www.example.com',
    'feed.tagline'      : 'This is a testing sample feed.',
    'feed.author.name'  : 'l.m.orchard',
    'feed.author.email' : 'l.m.orchard@pobox.com',
    'feed.author.url'   : 'http://www.decafbad.com'
}

# NOTE(review): the four templates below contain only substitution fields
# and no XML markup. The element tags may have been lost before this file
# reached review -- confirm against the intended Atom / RSS layout. The
# strings are kept exactly as found.
ATOM_FEED_TMPL = """ %(feed.title)s %(feed.tagline)s %(feed.modified)s %(feed.author.name)s %(feed.author.email)s %(feed.author.url)s %(feed.entries)s """

ATOM_ENTRY_TMPL = """ %(entry.title)s %(entry.modified)s %(entry.modified)s %(entry.id)s %(entry.content)s """

RSS_FEED_TMPL = """ %(feed.title)s %(feed.link)s %(feed.tagline)s %(feed.author.email)s %(feed.entries)s """

RSS_ENTRY_TMPL = """ %(entry.title)s %(entry.link)s %(entry.modified)s %(entry.id)s %(entry.summary)s """

def main():
    """
    Find all HTML documents in a given path and produce a
    syndication feed based on the pages' metadata.

    sys.argv[1] is the path handed to htmlmetalib.findHTML().
    sys.argv[2], when present, names the file the feed is written to;
    otherwise the feed goes to standard output.  When no path is given,
    a usage line is written to standard error and the process exits
    with status 1.
    """
    # Without a path there is nothing to scan; explain instead of
    # failing with an IndexError on sys.argv[1].
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s HTML_PATH [OUTPUT_FILE]\n" % sys.argv[0])
        sys.exit(1)

    # Atom output is the active choice.  To produce RSS instead, use
    # RSS_FEED_TMPL, RSS_ENTRY_TMPL and RSSTemplateDocWrapper here.
    FEED_TMPL   = ATOM_FEED_TMPL
    ENTRY_TMPL  = ATOM_ENTRY_TMPL
    doc_wrapper = AtomTemplateDocWrapper

    # Find all the HTML docs.
    docs = htmlmetalib.findHTML(sys.argv[1])

    # Bundle all the HTML doc objects in template-friendly wrappers.
    # Sorting defers to the wrapped documents' own comparison.
    entries = [doc_wrapper(BASE_HREF, TAG_DOMAIN, d) for d in docs]
    entries.sort()

    # Build a map for the feed template.
    data_out = {}
    data_out.update(FEED_META)
    entries_out = [ENTRY_TMPL % e for e in entries[:MAX_ENTRIES]]
    data_out['feed.modified'] = \
        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    data_out['feed.entries'] = "".join(entries_out)

    # Render before touching the output file so that a template error
    # cannot leave a truncated file behind.
    feed_out = FEED_TMPL % data_out

    # Handle optional parameter to output to a file, making sure the
    # file is closed (and therefore flushed) even if the write fails.
    if len(sys.argv) > 2:
        fout = open(sys.argv[2], "w")
        try:
            fout.write(feed_out)
        finally:
            fout.close()
    else:
        sys.stdout.write(feed_out)

class TemplateDocWrapper:
    """
    This class is a wrapper around HTMLMetaDoc objects meant to
    facilitate easy use in XML template strings.

    Instances are used as the mapping on the right-hand side of a
    "%" string format: each "%(entry.xxx)s" field in a template is
    resolved through __getitem__.
    """
    # Encoding applied to every value before it is XML-escaped.
    UNICODE_ENC  = "UTF-8"
    # strftime format used for "entry.modified".
    MODIFIED_FMT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, base_href, tag_domain, doc):
        """
        Initialize the wrapper.

        base_href  -- base URL that the document path is joined onto
        tag_domain -- authority name placed in the entry's tag URI
        doc        -- the document object being wrapped
        """
        self._base_href  = base_href
        self._tag_domain = tag_domain
        self._doc        = doc

    def __cmp__(self, other):
        """Use the docs' comparison method."""
        return cmp(self._doc, other._doc)

    def __getitem__(self, name):
        """
        Translate map-like access from a template into proper values
        based on document attributes.

        Unknown names yield the string "(undefined)" rather than an
        error.  The returned value is encoded with UNICODE_ENC and
        XML-escaped.
        """
        if name == "entry.title":
            # Return the document's title.
            val = self._doc.title

        elif name == "entry.summary":
            # Return the document's description
            val = self._doc.description

        elif name == "entry.content":
            # Return the document's content
            val = self._doc.content

        elif name == "entry.link":
            # Construct entry's link from document path and base HREF
            val = urlparse.urljoin(self._base_href, self._doc.path)

        elif name == "entry.modified":
            # Use the class modified time format to create the string
            val = time.strftime(self.MODIFIED_FMT,
                                time.gmtime(self._doc.modified))

        elif name == "entry.id":
            # Construct a canonical tag URI for the entry GUID.  The
            # path is quoted with no "safe" characters so that "/" is
            # percent-encoded as well.
            ymd = time.strftime("%Y-%m-%d",
                                time.gmtime(self._doc.modified))
            val = "tag:%s,%s:%s" % (self._tag_domain, ymd,
                                    urllib.quote(self._doc.path, ''))

        else:
            # Who knows what the template wanted?
            val = "(undefined)"

        # Make sure the value is finally safe for inclusion in XML
        return escape(val.encode(self.UNICODE_ENC))

class AtomTemplateDocWrapper(TemplateDocWrapper):
    """Template wrapper for Atom-style entries"""
    MODIFIED_FMT = "%Y-%m-%dT%H:%M:%SZ"

class RSSTemplateDocWrapper(TemplateDocWrapper):
    """Template wrapper for RSS-style entries"""
    # The timestamp is built from time.gmtime(), so the zone is spelled
    # out literally: "%z" does not reliably report UTC for such a value
    # and could label the UTC time with the wrong zone.
    MODIFIED_FMT = "%a, %d %b %Y %H:%M:%S GMT"

if __name__ == "__main__":
    main()