#!/usr/bin/env python
"""
FeedAutodiscoveryParser.py
This module implements a simple feed autodiscovery technique using
HTMLParser from the standard Python library.
"""
import sys
from urllib2 import urlopen
from urlparse import urljoin
from HTMLParser import HTMLParser, HTMLParseError
class FeedAutodiscoveryParser(HTMLParser):
    """
    Collect feed-candidate <link> elements from an HTML document.

    Push HTML into the parser with ``feed()``. Afterwards:

    * ``feeds`` is a list with one dict per candidate, in document order,
      holding the keys ``type``, ``title`` and ``href``. The HREF is stored
      as written in the document; it is not resolved to an absolute URL
      by this class.
    * ``base_href`` is the base URL given to the constructor, or the value
      of the most recent non-empty <base href="..."> seen while parsing.
    """

    # These are the MIME types of links accepted as feeds
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def __init__(self, base_href):
        """
        Initialize the parser.

        base_href -- URL used as the base for relative links until a
                     <base> tag in the document overrides it.
        """
        HTMLParser.__init__(self)
        self.base_href = base_href
        self.feeds = []

    def handle_starttag(self, tag, attrs_tup):
        """
        While parsing HTML, watch out for <base> and <link> tags.
        Accumulate any feed-candidate links found.

        tag       -- name of the tag being opened.
        attrs_tup -- sequence of (name, value) pairs for its attributes.
        """
        # Turn the tag name to lowercase for easier comparison, and make a
        # dict with lowercase keys for the tag attributes.  Attributes
        # written without a value are reported as None; treat them as
        # empty strings so the string handling below is safe.
        tag = tag.lower()
        attrs = dict((k.lower(), v or "") for k, v in attrs_tup)

        # If we find a <base> tag with a new HREF, change the current base
        # HREF.  An empty HREF carries no information, so it is ignored
        # rather than allowed to wipe out the base we already have.
        if tag == "base" and attrs.get("href", "").strip():
            self.base_href = attrs["href"].strip()

        # Only <link> tags can be feed candidates.
        if tag != "link":
            return

        # REL is a whitespace-separated, case-insensitive list of tokens,
        # e.g. rel="alternate nofollow".
        rel_tokens = attrs.get("rel", "").lower().split()

        # MIME types are case-insensitive and may carry parameters after
        # a ';' (e.g. "application/rss+xml; charset=utf-8").
        mime_type = attrs.get("type", "").split(";")[0].strip().lower()

        title = attrs.get("title", "")
        href = attrs.get("href", "").strip()

        # A candidate needs rel=alternate, a known feed type and an actual
        # HREF; without an HREF there is nothing to point at.
        if "alternate" in rel_tokens and mime_type in self.FEED_TYPES and href:
            self.feeds.append({
                'type': mime_type,
                'title': title,
                'href': href,
            })
def getFeedsDetail(url):
    """
    Load up the given URL, parse, and return any feeds found.

    url -- address of the HTML page to inspect.

    Returns a list of dicts, one per feed candidate in document order,
    each with the keys 'type', 'title' and 'href'; 'href' is resolved
    against the page's base URL.  The list is empty when no candidates
    are found.

    Errors raised while opening or reading the URL are not caught here
    and propagate to the caller.
    """
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    parser = FeedAutodiscoveryParser(url)
    try:
        parser.feed(data)
    except HTMLParseError:
        # Ignore any parse errors, since HTML is dirty and what we want
        # should be early on in the document anyway.
        pass

    # A <base href> may itself be relative (e.g. "/blog/"); resolve it
    # against the page URL first so the final HREFs really are absolute.
    base_href = urljoin(url, parser.base_href)

    # Fix up feed HREFs, converting to absolute URLs using the base HREF.
    for feed in parser.feeds:
        feed['href'] = urljoin(base_href, feed['href'])

    return parser.feeds
def getFeeds(url):
    """
    Return only the HREFs of the feeds found at the given URL.

    This is a thin convenience wrapper around getFeedsDetail() that keeps
    the 'href' value of every result, in the same order, and drops the
    rest of the detail.
    """
    hrefs = []
    for detail in getFeedsDetail(url):
        hrefs.append(detail['href'])
    return hrefs
def main(argv=None):
    """
    Command-line entry point: list the feed candidates found at a URL.

    argv -- optional argument list shaped like sys.argv (program name
            first, then the URL).  Defaults to sys.argv.

    Prints one line per feed candidate to standard output.  If no URL is
    supplied, exits via sys.exit() with a usage message instead of
    failing with an IndexError traceback.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        program = argv[0] if argv else "FeedAutodiscoveryParser.py"
        # sys.exit() with a string writes it to stderr and exits non-zero.
        sys.exit("Usage: %s <url>" % program)

    url = argv[1]
    feeds = getFeedsDetail(url)

    # Single-argument, parenthesised print produces the same output as the
    # plain print statement while staying valid as a function call.
    print("")
    print("Found the following possible feeds at %s:" % url)
    for feed in feeds:
        print("\t '%(title)s' of type %(type)s at %(href)s" % feed)
    print("")
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()