Economist Python Web Scraper

Here’s a Python web scraper that pulls down the full print edition of The Economist for free. (Or you could just visit their online print edition and reject their cookies.) It’s written for Python 2, using urllib2 and the original BeautifulSoup.

import os
import urllib2
import re
import htmlentitydefs
import codecs
import time
from BeautifulSoup import BeautifulSoup

URL_REQUEST_DELAY = 1  # seconds to pause between article requests
BASE = 'http://www.economist.com'
TXDATA = None
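# Pose as a desktop browser; some sites serve different pages to unknown clients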
TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
OUTPUT_FILE = 'economist.txt'

def request_url(url, txdata, txheaders):
    """Gets a webpage's HTML as a raw byte string"""
    req = urllib2.Request(url, txdata, txheaders)
    handle = urllib2.urlopen(req)
    html = handle.read()
    handle.close()
    return html

def remove_html_tags(data):
    """Strips HTML tags with a non-greedy regex"""
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def unescape(text):
    """
    Converts HTML character codes to Unicode code points.

    @param text the HTML (or XML) source text in any encoding.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text
    return re.sub(r"&#?\w+;", fixup, text)
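# Example (hypothetical input): unescape('caf&#233; &amp; bar') -> u'caf\xe9 & bar'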


html = request_url('http://www.economist.com/printedition/', TXDATA, TXHEADERS)
# Use BeautifulSoup to easily navigate HTML tree
soup = BeautifulSoup(html)

# Collect article URLs from the print-edition contents page ("block" divs)
urls = []
for block in soup.findAll('div', {'class': 'block'}):
    for h_tag in block.findAll('h2'):
        if h_tag.find('a'):
            urls.append(h_tag.find('a')['href'])

# Prepend the base URL to any relative links.
# Iterate over a copy of the list so the in-place edits don't disturb iteration.
for i, url in enumerate(urls[:]):
    if not url.startswith('http'):
        urls[i] = BASE + url

# Extracts headline, byline, dateline and content; outputs to file
if os.path.exists(OUTPUT_FILE):
    os.remove(OUTPUT_FILE)
output = codecs.open(OUTPUT_FILE, 'a', 'utf-8')
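# codecs.open handles encoding for us, so Unicode strings can be written directly;
# mode 'a' appends, but the old file was removed above, so output starts fresh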

for url in urls:
    content = ''
    html = request_url(url, TXDATA, TXHEADERS)
    html = unicode(html, 'utf-8', errors='ignore')
    soup = BeautifulSoup(html)

    body = soup.find('div', {'id' : 'ec-article-body'})
    if body:

        # Get headlines
        if body.find('h1'):
            h_one = body.find('h1').renderContents().strip()
            if not h_one == '':
                output.write(unicode(h_one + "\n", 'utf-8'))
        if body.find('div', {'class' : 'headline'}):
            headline = unescape(unicode(body.find('div', {'class' : 'headline'})\
                                .renderContents(), 'utf-8')).strip()
            if not headline == '':
                print headline
                output.write(headline + u'\n')
        if body.find('h2', {'class' : 'rubric'}):
            h_two = body.find('h2', {'class' : 'rubric'})\
                    .renderContents().strip()
            if not h_two == '':
                output.write(unicode(h_two + "\n", 'utf-8'))

        # Get date and location
        if body.find('p', {'class' : 'ec-article-info'}):
            article_info = body.find('p', {'class' : 'ec-article-info'})\
                           .renderContents().strip()
            article_info = remove_html_tags(article_info)
            article_info = re.sub(" +", " ", article_info)
            output.write(unicode(article_info + "\n\n", 'utf-8'))

        # Get content
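        # Unclassed <p> tags appear to hold the article text; classed ones are asides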
        for p in body.findAll('p', {'class': None}):
            content = content + p.renderContents().strip() + "\n\n"
        content = remove_html_tags(content)
        content = re.sub(" +", " ", content)
        content = unicode(content, 'utf-8')
        content = unescape(content)
        content = content + "\n"
        output.write(content)

    # Be polite: pause between article requests
    time.sleep(URL_REQUEST_DELAY)

output.close()
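
The script above targets Python 2 with urllib2 and the original BeautifulSoup, both long since superseded. For anyone on Python 3, the fetch-and-parse plumbing might look roughly like the sketch below (untested, and it assumes the beautifulsoup4 package); the headline and body extraction would carry over with the same find calls.

import urllib.request
from bs4 import BeautifulSoup

TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

def request_url(url, data=None, headers=TXHEADERS):
    """Gets a webpage's HTML as a decoded string"""
    req = urllib.request.Request(url, data, headers)
    with urllib.request.urlopen(req) as handle:
        return handle.read().decode('utf-8', errors='ignore')

html = request_url('http://www.economist.com/printedition/')
soup = BeautifulSoup(html, 'html.parser')
urls = [h2.find('a')['href']
        for block in soup.find_all('div', {'class': 'block'})
        for h2 in block.find_all('h2')
        if h2.find('a')]

Note that Python 3's standard html.unescape() would also replace the hand-rolled entity decoder entirely.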