#!/usr/bin/env python
"""
ch15_popular_links.py
Build a feed summarizing link popularity found in a set of feeds.
"""
import sys, re, time, calendar, feedparser
from scraperlib import FeedEntryDict
from monitorfeedlib import LogBufferFeed
from HTMLParser import HTMLParser, HTMLParseError
# Title and tagline stored in the generated feed's metadata.
FEED_TITLE = 'Popular Links'
FEED_TAGLINE = 'Links found in feed entries ranked by popularity'

# Output path pattern; the '%s' is filled with 'rss' or 'atom'.
FEED_NAME_FN = "www/www.decafbad.com/docs/private-feeds/popular-links.%s"

# Argument handed to LogBufferFeed when the feed generator is constructed.
FEED_DIR = 'popular-links-feed'

# Text file listing the feed URIs to scan, one per line.
FEEDS_FN = 'popular-feed-uris.txt'

# A link is only reported when at least this many feeds reference it.
MIN_LINKS = 3

# Entries older than this many seconds (3 days) are not scanned.
MAX_ENTRY_AGE = 3 * 24 * 60 * 60

# Title of each report entry; 'time' is formatted with TITLE_TIME_FMT.
TITLE_TMPL = """Popular links @ %(time)s (%(link_cnt)s links)"""
TITLE_TIME_FMT = """%Y-%m-%d %H:%M"""

# NOTE(review): the three templates below previously carried little or no
# markup, and LINK_TMPL had no placeholders at all, so every link collected
# by main() was discarded from the report.  The HTML here is a
# reconstruction that uses every key main() supplies; it assumes the entry
# summary is rendered as HTML -- confirm against the feed consumer.

# Wrapper around the whole report; '%s' receives the joined LINK_TMPL blocks.
CONTENT_TMPL = """
<div>
%s
</div>
"""

# One block per popular link.  Keys: link, link_cnt, linkers.
LINK_TMPL = """
<div>
<a href="%(link)s">%(link)s</a>
<br />
<i>(%(link_cnt)s links)</i>
<ul>
%(linkers)s
</ul>
</div>
"""

# One item per feed entry that references the link.
# Keys: entry.title, entry.link, feed.title, feed.link.
LINKER_TMPL = """
<li>
<a href="%(entry.link)s">%(entry.title)s</a>
<br />
[ <a href="%(feed.link)s">%(feed.title)s</a> ]
</li>
"""
def main():
    """
    Scan all feeds and update the feed with a new link popularity
    report entry.

    Reads the feed URIs listed in FEEDS_FN, indexes the links found in
    recent entries, renders every link referenced by at least MIN_LINKS
    feeds into one report entry, appends that entry to the LogBufferFeed,
    and writes the feed out in RSS and Atom form to FEED_NAME_FN.
    """
    # Construct the feed generator.
    f = LogBufferFeed(FEED_DIR)
    f.MAX_AGE = 1 * 24 * 60 * 60 # 1 day
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Gather links from all feeds and render the popular ones.
    links = _index_links(_read_feed_uris(FEEDS_FN))
    links_out = _render_links(links)

    # Complete building the content for this entry by populating the
    # CONTENT_TMPL string template.
    out = CONTENT_TMPL % '\n'.join(links_out)

    # Construct and append a new entry.
    entry = FeedEntryDict({
        'title'   : TITLE_TMPL % {
            'link_cnt' : len(links_out),
            'time'     : time.strftime(TITLE_TIME_FMT)
        },
        'link'    : '',
        'summary' : out
    })
    f.append_entry(entry)

    # Output the current feed entries as both RSS and Atom.  The content
    # is generated before the file is opened so that a failure while
    # scraping does not leave a truncated output file behind.
    for suffix, scrape in (('rss', f.scrape_rss), ('atom', f.scrape_atom)):
        feed_text = scrape()
        with open(FEED_NAME_FN % suffix, 'w') as fout:
            fout.write(feed_text)


def _read_feed_uris(path):
    """Return the stripped, non-blank lines of the text file at path."""
    with open(path, 'r') as fin:
        stripped = [line.strip() for line in fin]
    # Blank lines (e.g. a trailing newline) are not feed URIs.
    return [uri for uri in stripped if uri]


def _index_links(feed_uris):
    """
    Skim recent entries of each feed for links and return an inverted
    index of the form {link URL: {feed key: (feed, entry)}}.

    Only the first entry seen per feed is kept for any given link, so
    the size of a link's inner dict is the number of feeds linking to it.
    """
    links = {}
    # One reference time for the whole run gives every entry the same
    # age cutoff, however long the feeds take to fetch.
    now = time.time()

    for feed_uri in feed_uris:
        feed_data = feedparser.parse(feed_uri)

        # Grab the feed metadata from parsed feed.  Feeds without a link
        # are keyed by their own URI so that several link-less feeds are
        # not collapsed into a single linker.
        feed = feed_data.feed
        feed_key = feed.get('link') or feed_uri

        # Process all entries for their links...
        for curr_entry in feed_data.entries:

            # HACK: Ignore entries without modification dates.
            # Maybe improve this by stashing seen dates in a DB.
            modified = curr_entry.get('modified_parsed', None)
            if modified is None:
                continue

            # If the current entry is older than the max allowed age,
            # skip processing it.
            if (now - calendar.timegm(modified)) > MAX_ENTRY_AGE:
                continue

            # Build a LinkSkimmer and feed it all summary and HTML
            # content data from the current entry.  Ignore parse
            # errors in the interest of just grabbing what we can.
            skimmer = LinkSkimmer()
            try:
                skimmer.feed(curr_entry.get('summary', ''))
                for c in curr_entry.get('content', []):
                    skimmer.feed(c.value)
            except HTMLParseError:
                pass

            # Process each link by adding the current feed and entry
            # under the link's key in the inverted index.  The per-entry
            # occurrence count is not used for ranking.
            for uri, _cnt in skimmer.get_links():
                links.setdefault(uri, {}).setdefault(
                    feed_key, (feed, curr_entry))

    return links


def _render_links(links):
    """
    Return a list of LINK_TMPL blocks, most-linked first, for every link
    in the inverted index that has at least MIN_LINKS linkers.
    """
    # Rank by number of linking feeds, descending.  The sort is stable,
    # so links with equal counts keep their original relative order.
    ranked = sorted(links.items(), key=lambda item: len(item[1]),
                    reverse=True)

    links_out = []
    for link, linkers in ranked:
        # The list is in descending order, so once one link falls short
        # every remaining link does too.
        if len(linkers) < MIN_LINKS:
            break

        # Build the list of linkers for this link by populating the
        # LINKER_TMPL string template.
        linkers_out = [
            LINKER_TMPL % {
                'feed.title'  : feed.get('title', 'untitled'),
                'feed.link'   : feed.get('link', '#'),
                'entry.title' : entry.get('title', 'untitled'),
                'entry.link'  : entry.get('link', '#'),
            }
            for feed, entry in linkers.values()
        ]

        # Build the content block for this link by populating the
        # LINK_TMPL string template.
        links_out.append(LINK_TMPL % {
            'link'     : link,
            'link_cnt' : len(linkers),
            'linkers'  : '\n'.join(linkers_out)
        })

    return links_out
class LinkSkimmer(HTMLParser):
    """
    Quick and dirty link harvester.

    Pass HTML fragments to feed(); every href found on an <a> start tag
    is tallied.  get_links() returns the tallies and reset() clears them.
    """

    def reset(self):
        """Reset the parser and the list of links."""
        HTMLParser.reset(self)
        # Maps each href value to the number of times it has been seen.
        self.links = {}

    def get_links(self):
        """Return the links found as a list of tuples, link and count."""
        return list(self.links.items())

    def handle_starttag(self, tag, attrs_tup):
        """Harvest href attributes from link tags."""
        if tag != "a":
            return
        href = dict(attrs_tup).get('href')
        # A valueless attribute (<a href>) is reported with the value
        # None, and an empty href names no target; neither is a link
        # worth counting.
        if not href:
            return
        self.links[href] = self.links.get(href, 0) + 1
# Build the report only when run as a script, not when imported.
if __name__ == '__main__':
    main()