ch15_popular_links.py
author deusx
Fri Sep 30 02:35:36 2005 +0000 (4 years ago)
branch hacking_rss_and_atom
changeset 2 f7a85b9fd48a
permissions -rw-r--r--
[svn r605] [HackingRssAndAtom]: Added a README
     1 #!/usr/bin/env python
     2 """
     3 ch15_popular_links.py
     4 
     5 Build a feed summarizing link popularity found in a set of feeds.
     6 """
     7 import sys, re, time, calendar, feedparser
     8 
     9 from scraperlib import FeedEntryDict
    10 from monitorfeedlib import LogBufferFeed
    11 from HTMLParser import HTMLParser, HTMLParseError
    12 
    13 FEED_TITLE     = 'Popular Links'
    14 FEED_TAGLINE   = 'Links found in feed entries ranked by popularity'
    15 FEED_NAME_FN   = "www/www.decafbad.com/docs/private-feeds/popular-links.%s"
    16 FEED_DIR       = 'popular-links-feed'
    17 FEEDS_FN       = 'popular-feed-uris.txt'
    18 MIN_LINKS      = 3
    19 MAX_ENTRY_AGE  = 3 * 24 * 60 * 60
    20 
    21 TITLE_TMPL     = """Popular links @ %(time)s (%(link_cnt)s links)"""
    22 TITLE_TIME_FMT = """%Y-%m-%d %H:%M"""
    23 
    24 CONTENT_TMPL = """
    25     <div>
    26         %s
    27     </div>
    28 """
    29 LINK_TMPL = """
    30     <div style="padding: 10px; margin: 10px; border: 1px solid #aaa;">
    31         <a style="font-size:1.25em" href="%(link)s">%(link).80s</a><br />
    32         <i style="font-size:0.75em">(%(link_cnt)s links)</i> 
    33         <ul>
    34             %(linkers)s
    35         </ul>
    36     </div>
    37 """
    38 LINKER_TMPL = """
    39     <li>
    40         <a href="%(entry.link)s">%(entry.title)s</a>
    41         <br />
    42         <span style="font-size:0.75em">
    43             [ <a href="%(feed.link)s">%(feed.title)s</a> ]
    44         </span>
    45     </li>
    46 """
    47 
    48 def main():
    49     """
    50     Scan all feeds and update the feed with a new link popularity
    51     report entry.
    52     """
    53     # Construct the feed generator.
    54     f = LogBufferFeed(FEED_DIR)
    55     f.MAX_AGE = 1 * 24 * 60 * 60 # 1 day
    56     f.FEED_META['feed.title']   = FEED_TITLE
    57     f.FEED_META['feed.tagline'] = FEED_TAGLINE
    58 
    59     # Load up the list of feeds.
    60     feed_uris  = [ x.strip() for x in open(FEEDS_FN,'r').readlines() ]
    61 
    62     # Skim for links from each feed, collect feed and entries in an
    63     # inverted index using link URLs as top-level keys.
    64     links = {}
    65     for feed_uri in feed_uris:
    66         feed_data = feedparser.parse(feed_uri)
    67     
    68         # Grab the feed metadata from parsed feed.
    69         feed      = feed_data.feed
    70         feed_link = feed.get('link', '#')
    71         
    72         # Process all entries for their links...
    73         for curr_entry in feed_data.entries:
    74             
    75             # HACK: Ignore entries without modification dates.
    76             # Maybe improve this by stashing seen dates in a DB.
    77             if curr_entry.get('modified_parsed', None) is None: 
    78                 continue
    79             
    80             # If the current entry is older than the max allowed age,
    81             # skip processing it.
    82             now = time.time()
    83             entry_time = calendar.timegm(curr_entry.modified_parsed)
    84             if (now - entry_time) > MAX_ENTRY_AGE:
    85                 continue
    86             
    87             # Build a LinkSkimmer and feed it all summary and HTML
    88             # content data from the current entry.  Ignore parse
    89             # errors in the interest of just grabbing what we can.
    90             skimmer = LinkSkimmer()
    91             try:
    92                 skimmer.feed(curr_entry.get('summary',''))
    93                 for c in curr_entry.get('content', []): 
    94                     skimmer.feed(c.value)
    95             except HTMLParseError:
    96                 pass
    97             
    98             # Process each link by adding the current feed and entry
    99             # under the link's key in the inverted index.
   100             for uri, cnt in skimmer.get_links():
   101                 if not links.has_key(uri): 
   102                     links[uri] = {}
   103                 if not links[uri].has_key(feed_link):
   104                     links[uri][feed_link] = (feed, curr_entry)
   105         
   106     # Turn the inverted index of links into a list of tuples, sort by
   107     # popularity of links as measured by number of linked entries.
   108     links_sorted = links.items()
   109     links_sorted.sort(lambda a,b: cmp(len(b[1].keys()), len(a[1].keys())))
   110     
   111     # Build the overall entry content from all the links.
   112     links_out = []
   113     for x in links_sorted:
   114         
   115         # Get the link and the list of linkers, skip this link if there
   116         # aren't enough linkers counted.
   117         link, linkers = x
   118         if len(linkers) < MIN_LINKS: continue
   119         
   120         # Build the list of linkers for this link by populating the
   121         # LINKER_TMPL string template.
   122         linkers_out = []
   123         for feed, entry in linkers.values():
   124             linkers_out.append(LINKER_TMPL % {
   125                 'feed.title'  : feed.get('title', 'untitled'),
   126                 'feed.link'   : feed.get('link', '#'),
   127                 'entry.title' : entry.get('title', 'untitled'),
   128                 'entry.link'  : entry.get('link', '#'),
   129             })
   130 
   131         # Build the content block for this link by populating the
   132         # LINK_TMPL string template.
   133         links_out.append(LINK_TMPL % {
   134             'link'       : link,
   135             'link_cnt'   : len(linkers),
   136             'linkers'    : '\n'.join(linkers_out)
   137         })
   138 
   139     # Complete building the content for this entry by populating the
   140     # CONTENT_TMPL string template.
   141     out = CONTENT_TMPL % '\n'.join(links_out)
   142     
   143     # Construct and append a new entry
   144     entry = FeedEntryDict({
   145         'title'   : TITLE_TMPL % {
   146             'link_cnt' : len(links_out),
   147             'time'     : time.strftime(TITLE_TIME_FMT)
   148         },
   149         'link'    : '',
   150         'summary' : out
   151     })
   152     f.append_entry(entry)
   153     
   154     # Output the current feed entries as both RSS and Atom
   155     open(FEED_NAME_FN % 'rss', 'w').write(f.scrape_rss())
   156     open(FEED_NAME_FN % 'atom', 'w').write(f.scrape_atom())
   157     
   158 class LinkSkimmer(HTMLParser):
   159     """
   160     Quick and dirty link harvester.
   161     """
   162     def reset(self):
   163         """Reset the parser and the list of links."""
   164         HTMLParser.reset(self)
   165         self.links = {}
   166         
   167     def get_links(self):
   168         """Return the links found as a list of tuples, link and count."""
   169         return self.links.items()
   170         
   171     def handle_starttag(self, tag, attrs_tup):
   172         """Harvest href attributes from link tags"""
   173         attrs = dict(attrs_tup)
   174         if tag == "a" and attrs.has_key('href'):
   175             self.links[attrs['href']] = self.links.get(attrs['href'], 0) + 1
   176     
   177 if __name__=='__main__': main()
   178