#!/usr/bin/env python """ ch09_fcc_scraper.py Use RegexScraper to produce a feed from fcc.gov news """ import sys, time, shelve, md5, re from urlparse import urljoin from scraperlib import RegexScraper def main(): """ Given an argument of 'atom' or 'rss' on the command line, produce an Atom or RSS feed. """ scraper = FCCScraper() if len(sys.argv) > 1 and sys.argv[1] == 'rss': print scraper.scrape_rss() else: print scraper.scrape_atom() class FCCScraper(RegexScraper): """Use regexes to scrape FCC news headlines""" # Filename of state database STATE_FN = "fcc_scraper_state" # URL to the Library of Congress news page. SCRAPE_URL = "http://www.fcc.gov/headlines.html" # Base HREF for all links on the page BASE_HREF = SCRAPE_URL # Metadata for scraped feed FEED_META = { 'feed.title' : 'FCC News', 'feed.link' : SCRAPE_URL, 'feed.tagline' : 'News from the FCC', 'feed.author.name' : 'Federal Communications Commission', 'feed.author.email' : 'fccinfo@fcc.gov', 'feed.author.url' : 'http://www.fcc.gov/aboutus.html' } # Regex to extract news headline paragraphs ENTRY_RE = '
' + \
'(?P
' + \
'(?P