#!/usr/bin/env python
"""
ch09_loc_scraper.py
Use HTMLScraper to produce a feed from loc.gov news
"""
import sys, time, shelve
from urlparse import urljoin
from scraperlib import HTMLScraper
def main():
    """
    Scrape the loc.gov news page and print the resulting feed.

    An RSS feed is printed only when the first command line argument
    is exactly 'rss'; any other argument (or none at all) yields Atom.
    """
    scraper = LOCScraper()

    # Decide on the output format first, then emit the feed in one place.
    wants_rss = len(sys.argv) > 1 and sys.argv[1] == 'rss'
    if wants_rss:
        feed = scraper.scrape_rss()
    else:
        feed = scraper.scrape_atom()

    print(feed)
class LOCScraper(HTMLScraper):
"""
Scrapes the Library of Congress press release page into a news feed.
"""
# Filename of the scraper's state database.
# NOTE(review): presumably opened via the shelve module imported at the
# top of this file -- confirm against HTMLScraper.
STATE_FN = "loc_scraper_state"
# URL of the Library of Congress news (press release) page to scrape.
SCRAPE_URL = "http://www.loc.gov/today/pr/"
# Base HREF for all links on the page; the scraped page serves as its
# own base.
BASE_HREF = SCRAPE_URL
# Feed-level metadata for the scraped feed. Both the feed link and the
# author URL point back at the scraped page.
FEED_META = {
'feed.title' : 'News from The Library of Congress',
'feed.link' : SCRAPE_URL,
'feed.tagline' : 'Press releases scraped from loc.gov',
'feed.author.name' : 'Library of Congress',
'feed.author.email' : 'pao@loc.gov',
'feed.author.url' : SCRAPE_URL,
}
def reset(self):
    """
    Return the scraper to a clean state ahead of the next run.

    This override keeps no state of its own; it simply delegates to
    the HTMLScraper implementation.
    """
    HTMLScraper.reset(self)
def handle_comment(self, data):
    """
    Watch HTML comments for the markers that delimit the extraction.

    A comment containing 'START PR LIST' triggers self.start_feed();
    a comment containing 'END BODY TABLE' triggers self.end_feed().
    The two checks are independent, so a comment carrying both markers
    fires both calls, in that order.
    """
    # Opening marker: the press release list begins after this comment.
    if 'START PR LIST' in data:
        self.start_feed()

    # Closing marker: the list is over once this comment is seen.
    if 'END BODY TABLE' in data:
        self.end_feed()
def handle_starttag(self, tag, attrs_tup):
"""Handle start tags."""
attrs = dict(attrs_tup)
# Use