Economist Python Web Scraper

Here’s a Python web scraper that pulls down the full print edition of The Economist for free. (Or you could just visit their online print edition and reject their cookies.) It’s written for Python 2, using urllib2 and the original BeautifulSoup.

import os
import urllib2
import re
import htmlentitydefs
import codecs
import time
from BeautifulSoup import BeautifulSoup

URL_REQUEST_DELAY = 1  # seconds to pause between article requests
BASE = 'http://www.economist.com'
TXDATA = None
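# Pose as a desktop browser; some sites serve different pages to unknown clients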
TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
OUTPUT_FILE = 'economist.txt'

def request_url(url, txdata, txheaders):
    """Gets a webpage's HTML as a raw byte string"""
    req = urllib2.Request(url, txdata, txheaders)
    handle = urllib2.urlopen(req)
    html = handle.read()
    handle.close()
    return html

def remove_html_tags(data):
    """Strips HTML tags with a non-greedy regex"""
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def unescape(text):
    """
    Converts HTML character codes to Unicode code points.

    @param text the HTML (or XML) source text in any encoding.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text
    return re.sub(r"&#?\w+;", fixup, text)
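# Example (hypothetical input): unescape('caf&#233; &amp; bar') -> u'caf\xe9 & bar'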


html = request_url('http://www.economist.com/printedition/', TXDATA, TXHEADERS)
# Use BeautifulSoup to easily navigate HTML tree
soup = BeautifulSoup(html)

# Collect article URLs from the print-edition contents page ("block" divs)
urls = []
for block in soup.findAll('div', {'class': 'block'}):
    for h_tag in block.findAll('h2'):
        if h_tag.find('a'):
            urls.append(h_tag.find('a')['href'])

# Prepend the base URL to any relative links.
# Iterate over a copy of the list so the in-place edits don't disturb iteration.
for i, url in enumerate(urls[:]):
    if not url.startswith('http'):
        urls[i] = BASE + url

# Extracts headline, byline, dateline and content; outputs to file
if os.path.exists(OUTPUT_FILE):
    os.remove(OUTPUT_FILE)
output = codecs.open(OUTPUT_FILE, 'a', 'utf-8')
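# codecs.open handles encoding for us, so Unicode strings can be written directly;
# mode 'a' appends, but the old file was removed above, so output starts fresh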

for url in urls:
    content = ''
    html = request_url(url, TXDATA, TXHEADERS)
    html = unicode(html, 'utf-8', errors='ignore')
    soup = BeautifulSoup(html)

    body = soup.find('div', {'id' : 'ec-article-body'})
    if body:

        # Get headlines
        if body.find('h1'):
            h_one = body.find('h1').renderContents().strip()
            if not h_one == '':
                output.write(unicode(h_one + "\n", 'utf-8'))
        if body.find('div', {'class' : 'headline'}):
            headline = unescape(unicode(body.find('div', {'class' : 'headline'})\
                                .renderContents(), 'utf-8')).strip()
            if not headline == '':
                print headline
                output.write(headline + u'\n')
        if body.find('h2', {'class' : 'rubric'}):
            h_two = body.find('h2', {'class' : 'rubric'})\
                    .renderContents().strip()
            if not h_two == '':
                output.write(unicode(h_two + "\n", 'utf-8'))

        # Get date and location
        if body.find('p', {'class' : 'ec-article-info'}):
            article_info = body.find('p', {'class' : 'ec-article-info'})\
                           .renderContents().strip()
            article_info = remove_html_tags(article_info)
            article_info = re.sub(" +", " ", article_info)
            output.write(unicode(article_info + "\n\n", 'utf-8'))

        # Get content
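        # Unclassed <p> tags appear to hold the article text; classed ones are asides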
        for p in body.findAll('p', {'class': None}):
            content = content + p.renderContents().strip() + "\n\n"
        content = remove_html_tags(content)
        content = re.sub(" +", " ", content)
        content = unicode(content, 'utf-8')
        content = unescape(content)
        content = content + "\n"
        output.write(content)

    # Be polite: pause between article requests
    time.sleep(URL_REQUEST_DELAY)

output.close()
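
The script above targets Python 2 with urllib2 and the original BeautifulSoup, both long since superseded. For anyone on Python 3, the fetch-and-parse plumbing might look roughly like the sketch below (untested, and it assumes the beautifulsoup4 package); the headline and body extraction would carry over with the same find calls.

import urllib.request
from bs4 import BeautifulSoup

TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

def request_url(url, data=None, headers=TXHEADERS):
    """Gets a webpage's HTML as a decoded string"""
    req = urllib.request.Request(url, data, headers)
    with urllib.request.urlopen(req) as handle:
        return handle.read().decode('utf-8', errors='ignore')

html = request_url('http://www.economist.com/printedition/')
soup = BeautifulSoup(html, 'html.parser')
urls = [h2.find('a')['href']
        for block in soup.find_all('div', {'class': 'block'})
        for h2 in block.find_all('h2')
        if h2.find('a')]

Note that Python 3's standard html.unescape() would also replace the hand-rolled entity decoder entirely.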