5 Build a feed summarizing link popularity found in a set of feeds.
7 import sys, re, time, calendar, feedparser
9 from scraperlib import FeedEntryDict
10 from monitorfeedlib import LogBufferFeed
11 from HTMLParser import HTMLParser, HTMLParseError
# Title and tagline published in the generated feed's metadata.
FEED_TITLE = "Popular Links"
FEED_TAGLINE = "Links found in feed entries ranked by popularity"

# Output path pattern; the %s slot is filled with the format name
# ('rss' or 'atom') when the feed files are written.
FEED_NAME_FN = "www/www.decafbad.com/docs/private-feeds/popular-links.%s"

# Argument handed to LogBufferFeed when the feed generator is built.
FEED_DIR = "popular-links-feed"

# Text file listing the feed URIs to scan, one per line.
FEEDS_FN = "popular-feed-uris.txt"

# Entries older than this many seconds (three days) are not scanned.
MAX_ENTRY_AGE = 60 * 60 * 24 * 3

# Title of each report entry, and the strftime format for its timestamp.
TITLE_TMPL = "Popular links @ %(time)s (%(link_cnt)s links)"
TITLE_TIME_FMT = "%Y-%m-%d %H:%M"
30 <div style="padding: 10px; margin: 10px; border: 1px solid #aaa;">
31 <a style="font-size:1.25em" href="%(link)s">%(link).80s</a><br />
32 <i style="font-size:0.75em">(%(link_cnt)s links)</i>
40 <a href="%(entry.link)s">%(entry.title)s</a>
42 <span style="font-size:0.75em">
43 [ <a href="%(feed.link)s">%(feed.title)s</a> ]
def _load_feed_uris(path):
    """Return the non-blank, whitespace-stripped lines of the file at path."""
    with open(path, 'r') as fin:
        stripped = [line.strip() for line in fin]
    # A blank line would otherwise be handed to feedparser as an empty URI.
    return [uri for uri in stripped if uri]


def _collect_links(feed_uris, now):
    """
    Parse every feed and build an inverted index keyed by link URL.

    feed_uris is an iterable of feed locations passed to feedparser.parse;
    now is the current time in seconds since the epoch, used to age out
    old entries.

    Returns a dict mapping each harvested link URL to a dict that maps a
    linking feed's link to a (feed, entry) tuple. Only the first entry
    seen per feed is kept for a given URL, so each feed counts once.
    """
    links = {}
    for feed_uri in feed_uris:
        feed_data = feedparser.parse(feed_uri)

        # Grab the feed metadata from parsed feed.
        feed = feed_data.feed
        feed_link = feed.get('link', '#')

        # Process all entries for their links...
        for curr_entry in feed_data.entries:

            # HACK: Ignore entries without modification dates.
            # Maybe improve this by stashing seen dates in a DB.
            if curr_entry.get('modified_parsed', None) is None:
                continue

            # If the current entry is older than the max allowed age,
            # skip it.
            entry_time = calendar.timegm(curr_entry.modified_parsed)
            if (now - entry_time) > MAX_ENTRY_AGE:
                continue

            # Build a LinkSkimmer and feed it all summary and HTML
            # content data from the current entry.  Ignore parse
            # errors in the interest of just grabbing what we can.
            skimmer = LinkSkimmer()
            try:
                skimmer.feed(curr_entry.get('summary', ''))
                for c in curr_entry.get('content', []):
                    # NOTE(review): assumes each content item exposes its
                    # markup under 'value' -- confirm against feedparser.
                    skimmer.feed(c.get('value', ''))
            except HTMLParseError:
                pass

            # Process each link by adding the current feed and entry
            # under the link's key in the inverted index.
            for uri, _count in skimmer.get_links():
                linkers = links.setdefault(uri, {})
                linkers.setdefault(feed_link, (feed, curr_entry))

    return links


def _render_links(links):
    """
    Render one LINK_TMPL block per sufficiently popular link.

    links is the inverted index built by _collect_links. Links are
    ordered by number of linking feeds, most linked first, and those
    with fewer than MIN_LINKS linkers are left out. Returns the list of
    rendered HTML strings.

    NOTE(review): values are interpolated into the HTML templates
    without escaping.
    """
    # Most-linked first; ties keep the order the index yields them in.
    links_sorted = sorted(
        links.items(), key=lambda item: len(item[1]), reverse=True
    )

    links_out = []
    for link, linkers in links_sorted:

        # Skip this link if there aren't enough linkers counted.
        if len(linkers) < MIN_LINKS:
            continue

        # Build the list of linkers for this link by populating the
        # LINKER_TMPL string template.
        linkers_out = [
            LINKER_TMPL % {
                'feed.title': feed.get('title', 'untitled'),
                'feed.link': feed.get('link', '#'),
                'entry.title': entry.get('title', 'untitled'),
                'entry.link': entry.get('link', '#'),
            }
            for feed, entry in linkers.values()
        ]

        # Build the content block for this link by populating the
        # LINK_TMPL string template.
        links_out.append(LINK_TMPL % {
            'link': link,
            'link_cnt': len(linkers),
            'linkers': '\n'.join(linkers_out),
        })

    return links_out


def _write_file(path, data):
    """Write data to path, closing the file even if the write fails."""
    with open(path, 'w') as fout:
        fout.write(data)


def main():
    """
    Scan all feeds and update the feed with a new link popularity
    report entry.

    Reads the feed URIs listed in FEEDS_FN, harvests links from their
    recent entries, appends one report entry to the log-buffer feed and
    rewrites the RSS and Atom files named by FEED_NAME_FN.
    """
    # Construct the feed generator.
    f = LogBufferFeed(FEED_DIR)
    f.MAX_AGE = 1 * 24 * 60 * 60  # 1 day
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Load up the list of feeds, then skim them for links using a single
    # reference time for the age check.
    feed_uris = _load_feed_uris(FEEDS_FN)
    links = _collect_links(feed_uris, time.time())

    # Build the overall entry content from all the links.
    links_out = _render_links(links)

    # Complete building the content for this entry by populating the
    # CONTENT_TMPL string template.
    out = CONTENT_TMPL % '\n'.join(links_out)

    # Construct and append a new entry.
    # NOTE(review): the 'link' and 'summary' keys are assumed from how
    # `out` must reach the entry -- confirm against FeedEntryDict.
    entry = FeedEntryDict({
        'title': TITLE_TMPL % {
            'link_cnt': len(links_out),
            'time': time.strftime(TITLE_TIME_FMT)
        },
        'link': '',
        'summary': out
    })
    f.append_entry(entry)

    # Output the current feed entries as both RSS and Atom
    _write_file(FEED_NAME_FN % 'rss', f.scrape_rss())
    _write_file(FEED_NAME_FN % 'atom', f.scrape_atom())
class LinkSkimmer(HTMLParser):
    """
    Quick and dirty link harvester.

    Feed HTML to the parser with feed(); each href found on an <a>
    start tag is tallied, and get_links() reports the tallies.
    """

    def reset(self):
        """Reset the parser and the list of links."""
        HTMLParser.reset(self)
        # Maps href value -> number of times it has been seen.
        self.links = {}

    def get_links(self):
        """Return the links found as a list of tuples, link and count."""
        # Wrapped in list() so callers get a real list on any Python
        # version, not a dictionary view.
        return list(self.links.items())

    def handle_starttag(self, tag, attrs_tup):
        """
        Harvest href attributes from link tags.

        tag is the tag name and attrs_tup the sequence of (name, value)
        attribute pairs supplied by HTMLParser. An <a> tag whose href is
        missing, valueless or empty is ignored.
        """
        if tag != "a":
            return
        href = dict(attrs_tup).get('href')
        # A valueless attribute (<a href>) arrives with value None.
        if href:
            self.links[href] = self.links.get(href, 0) + 1
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()