New York Times Python Web Scraper

This Python web scraper extracts top New York Times articles and outputs a UTF-8-encoded .txt file.

import os
import urllib2
import cookielib
import re
import htmlentitydefs
import codecs
import time
from BeautifulSoup import BeautifulSoup

# Number of seconds passed to time.sleep() after an article is written.
URL_REQUEST_DELAY = 1
# Only links starting with this prefix are scraped; it is also prepended
# to the single-page link of a multi-page article.
BASE = 'http://www.nytimes.com'
# Data argument handed through request_url() to urllib2.Request.
TXDATA = None
# Headers sent with every request.
TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
# File the scraped articles are written to.
OUTPUT_FILE = 'nyt_top_stories.txt'

def request_url(url, txdata, txheaders):
    """
    Gets a webpage's HTML.

    @param url the address to fetch.
    @param txdata the data argument passed to Request.
    @param txheaders a dictionary of headers passed to Request.
    @return The response body exactly as returned by the handle's read().
    """
    req = Request(url, txdata, txheaders)
    handle = urlopen(req)
    try:
        return handle.read()
    finally:
        # Release the connection even when read() raises.
        handle.close()

def remove_html_tags(data):
    """
    Removes HTML tags.

    Every run of text from a "<" up to the nearest following ">" on the
    same line is deleted; all other text is returned unchanged.

    @param data the string to strip tags from.
    @return data with the tags removed.
    """
    # The pattern has to start at "<" itself: written as "< .*?>" (with a
    # space) it never matches ordinary tags such as "<p>".
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def unescape(text):
    """
    Converts HTML character codes to Unicode code points.

    Decimal references ("&#65;"), hexadecimal references ("&#x41;" or
    "&#X41;") and named entities ("&amp;") are replaced. Anything that
    cannot be decoded is left in the text exactly as written.

    @param text the HTML (or XML) source text in any encoding.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            try:
                # The "x" that marks a hexadecimal reference may be
                # written in either case.
                if text[:3] in ("&#x", "&#X"):
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a number outside the range unichr
                # accepts: keep the reference unchanged.
                pass
        else:
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                # Unknown entity name: keep it unchanged.
                pass
        return text
    return re.sub(r"&#?\w+;", fixup, text)


# Build an opener whose cookie processor is backed by an LWPCookieJar and
# install it as the opener urllib2 uses by default.
cookiejar = cookielib.LWPCookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cookiejar)
opener = urllib2.build_opener(cookie_handler)
urllib2.install_opener(opener)

# Short aliases used by request_url().
Request = urllib2.Request
urlopen = urllib2.urlopen

# Download the NYT Global homepage and parse it so the HTML tree can be
# navigated with BeautifulSoup.
html = request_url('http://global.nytimes.com/', TXDATA, TXHEADERS)
soup = BeautifulSoup(html)

# Collect the link target of every h2, h3 or h5 headline that is a direct
# child of a "story" div and contains an anchor.
urls = []
for story_div in soup.findAll('div', {'class': 'story'}):
    headings = story_div.findAll({'h2': True, 'h3': True, 'h5': True},
                                 recursive=False)
    for heading in headings:
        anchor = heading.find('a')
        if not anchor:
            continue
        urls.append(anchor['href'])

# Keep only the links that start with BASE; anything else is not treated
# as a news article.
urls = [link for link in urls if link.startswith(BASE)]

def _scrape_article(url, output):
    """
    Downloads one article and writes it to output.

    Writes the headline, byline and dateline followed by the body text.
    A page without an <nyt_headline> element is skipped. A missing byline
    is written as an empty line and a missing dateline is omitted, instead
    of aborting the whole run.

    @param url the article address.
    @param output an open file object that accepts Unicode strings.
    """
    html = request_url(url, TXDATA, TXHEADERS)
    soup = BeautifulSoup(unicode(html, 'utf-8'))
    # Gets HTML from single page link if article is > 1 page
    single = soup.find('li', {'class': 'singlePage'})
    if single:
        html = request_url(BASE + single.find('a')['href'], TXDATA, TXHEADERS)
        soup = BeautifulSoup(unicode(html, 'utf-8'))

    headline_tag = soup.find('nyt_headline')
    if not headline_tag:
        return
    headline = headline_tag.renderContents()
    print(headline)
    output.write(unicode(headline + "\n", 'utf-8'))

    # find() returns None for an element that is not on the page, so each
    # lookup is checked before a method is called on its result.
    byline = ''
    byline_block = soup.find('nyt_byline')
    if byline_block is not None:
        byline_tag = byline_block.find('h6')
        if byline_tag is not None:
            byline = remove_html_tags(byline_tag.renderContents())
    output.write(unicode(byline + "\n", 'utf-8'))

    dateline_tag = soup.find('h6', {'class': 'dateline'})
    if dateline_tag is not None:
        output.write(unicode(dateline_tag.renderContents(), 'utf-8'))

    paragraphs = []
    for p in soup.findAll('p', {'class': None, 'style': None}):
        # Removes potential ad at the bottom of the page.
        if p.findParents('div', {'class': 'singleAd'}):
            continue
        # Prevents contents of nested <p> tags from being printed twice.
        if p.findParents('div', {'class': 'authorIdentification'}):
            continue
        # Every paragraph, including the first, is preceded by a blank line.
        paragraphs.append("\n\n" + p.renderContents().strip())
    content = remove_html_tags("".join(paragraphs))
    content = re.sub(" +", " ", content)
    content = unescape(unicode(content, 'utf-8'))
    output.write(content + "\n\n\n\n")


# Extracts headline, byline, dateline and content; outputs to file.
# Opening with 'w' discards the output of any previous run.
output = codecs.open(OUTPUT_FILE, 'w', 'utf-8')
try:
    for url in urls:
        _scrape_article(url, output)
        # Pause after every page, including pages that were skipped.
        time.sleep(URL_REQUEST_DELAY)
finally:
    # Close the file even if a download or parse step raised.
    output.close()

The second script counts, for each article, the number of sentences, words, and characters, and calculates simple statistics such as words per sentence and characters per word.

from __future__ import division
import os
import urllib2
import cookielib
import re
import codecs
import htmlentitydefs
import time
from BeautifulSoup import BeautifulSoup

# Number of seconds passed to time.sleep() after an article is processed.
URL_REQUEST_DELAY = 1
# Only links starting with this prefix are scraped; it is also prepended
# to the single-page link of a multi-page article.
BASE = 'http://www.nytimes.com'
# Data argument handed through request_url() to urllib2.Request.
TXDATA = None
# Headers sent with every request.
TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
# File the per-article statistics are written to.
OUTPUT_FILE = 'nyt_stats.txt'
# Matches one sentence: up to two opening quote/parenthesis characters, a
# capital letter, then one or more runs of text that end in ".", "?" or "!"
# (each optionally followed by up to two closing quote/parenthesis
# characters). The repetition is lazy, so the match stops at the first
# terminator that is NOT followed by whitespace plus a lowercase letter,
# or by a digit.
SENTENCE = re.compile(ur'[\'"\u201c(]{0,2}[A-Z]([^.?!]*[.?!][\'"\u201d)]{0,2})+?(?!\s+[a-z]|\d)', re.UNICODE)

# Dictionary to replace common abbreviations for correct sentence segmentation.
# Each abbreviation maps to a form without periods, so the periods are not
# taken for sentence terminators. Applied to the article text with
# multiple_replace() before SENTENCE is matched against it.
DICT1 = {
      "Mr." : "Mr",
      "Mrs." : "Mrs",
      "Ms." : "Ms",
      "Jan." : "Jan",
      "Feb." : "Feb",
      "Mar." : "Mar",
      "Apr." : "Apr",
      "Jun." : "Jun",
      "Jul." : "Jul",
      "Aug." : "Aug",
      "Sept." : "Sept",
      "Sep." : "Sep",
      "Oct." : "Oct",
      "Nov." : "Nov",
      "Dec." : "Dec",
      "Jr." : "Jr",
      "Brig." : "Brig",
      "Gen." : "Gen",
      "Maj." : "Maj",
      "a.m." : "AM",
      "p.m." : "PM",
      "Rev." : "Rev",
      "Fla." : "Fla",
      "Dr." : "Dr",
      "Gov." : "Gov",
}

# Dictionary to prepare for word tokenization.
# Punctuation is either deleted or turned into a single space; the result
# is then split into words on whitespace.
DICT2 = {
      '.' : '',
      ',' : '',
      u'\u201c' : '',       # Left curly quotation mark
      u'\u201d' : '',       # Right curly quotation mark
      u'\u2014' : '',       # Em-dash
      '"' : '',
      ' - ' : ' ',
      '(' : '',
      ')' : '',
      ';' : '',
      ':' : '',
      '?' : '',
      '!' : '',
      '--' : ' ',
}

def request_url(url, txdata, txheaders):
    """
    Gets a webpage's HTML.

    @param url the address to fetch.
    @param txdata the data argument passed to Request.
    @param txheaders a dictionary of headers passed to Request.
    @return The response body exactly as returned by the handle's read().
    """
    req = Request(url, txdata, txheaders)
    handle = urlopen(req)
    try:
        return handle.read()
    finally:
        # Release the connection even when read() raises.
        handle.close()

def remove_html_tags(data):
    """
    Removes HTML tags.

    Every run of text from a "<" up to the nearest following ">" on the
    same line is deleted; all other text is returned unchanged.

    @param data the string to strip tags from.
    @return data with the tags removed.
    """
    # The pattern has to start at "<" itself: written as "< .*?>" (with a
    # space) it never matches ordinary tags such as "<p>".
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def unescape(text):
    """
    Converts HTML character codes to Unicode code points.

    Decimal references ("&#65;"), hexadecimal references ("&#x41;" or
    "&#X41;") and named entities ("&amp;") are replaced. Anything that
    cannot be decoded is left in the text exactly as written.

    @param text the HTML (or XML) source text in any encoding.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            try:
                # The "x" that marks a hexadecimal reference may be
                # written in either case.
                if text[:3] in ("&#x", "&#X"):
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a number outside the range unichr
                # accepts: keep the reference unchanged.
                pass
        else:
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                # Unknown entity name: keep it unchanged.
                pass
        return text
    return re.sub(r"&#?\w+;", fixup, text)

def multiple_replace(adict, text):
    """
    Replaces multiple patterns in a string in a single pass

    Creates a regular expression from all dictionary keys.
    For each match, replace with the corresponding dictionary value.
    Keys are matched literally; where several keys could match at the
    same position, the longest one wins.

    @param adict dictionary mapping each substring to its replacement.
    @param text the string to search.
    @return text with every occurrence of a key replaced; text unchanged
            when adict is empty.
    """
    if not adict:
        # An empty alternation matches the empty string at every position
        # and the lookup of '' in the dictionary would then fail.
        return text
    # Alternation takes the first branch that matches, so a longer key must
    # come before any key that is a prefix of it.
    keys = sorted(adict, key=len, reverse=True)
    regex = re.compile("|".join(map(re.escape, keys)))
    return regex.sub(lambda match: adict[match.group(0)], text)


# Build an opener whose cookie processor is backed by an LWPCookieJar and
# install it as the opener urllib2 uses by default.
cookiejar = cookielib.LWPCookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cookiejar)
opener = urllib2.build_opener(cookie_handler)
urllib2.install_opener(opener)

# Short aliases used by request_url().
Request = urllib2.Request
urlopen = urllib2.urlopen

# Download the NYT Global homepage and parse it so the HTML tree can be
# navigated with BeautifulSoup.
html = request_url('http://global.nytimes.com/', TXDATA, TXHEADERS)
soup = BeautifulSoup(html)

# Collect the link target of every h2, h3 or h5 headline that is a direct
# child of a "story" div and contains an anchor.
urls = []
for story_div in soup.findAll('div', {'class': 'story'}):
    headings = story_div.findAll({'h2': True, 'h3': True, 'h5': True},
                                 recursive=False)
    for heading in headings:
        anchor = heading.find('a')
        if not anchor:
            continue
        urls.append(anchor['href'])

# Keep only the links that start with BASE; anything else is not treated
# as a news article.
urls = [link for link in urls if link.startswith(BASE)]

# Extracts headline, segments sentences, and tokenizes words.
if os.path.exists(OUTPUT_FILE):
    os.remove(OUTPUT_FILE)
output = codecs.open(OUTPUT_FILE, 'a', 'utf-8')

for url in urls:
    html = request_url(url, TXDATA, TXHEADERS)
    html = unicode(html, 'utf-8')
    soup = BeautifulSoup(html)
    # Gets HTML from single page link if article is over one page.
    if soup.find('li', {'class': 'singlePage'}):
        single = soup.find('li', {'class': 'singlePage'})
        html = request_url(BASE + single.find('a')['href'], TXDATA, TXHEADERS)
        html = unicode(html, 'utf-8')
        soup = BeautifulSoup(html)

    if not soup.find('nyt_headline'):
        continue
    headline = soup.find('nyt_headline').renderContents()
    print headline
    output.write(unicode(headline + "\n", 'utf-8'))

    content = ''
    sents = []
    words = []

    for p in soup.findAll('p', {'class': None, 'style': None}):
        # Removes potential ad at the bottom of the page.
        if p.findParents('div', {'class': 'singleAd'}):
            continue
        # Prevents contents of nested <p> tags from being printed twice.
        if p.findParents('div', {'class': 'authorIdentification'}):
            continue
        content = content + " " + p.renderContents().strip()
    content = remove_html_tags(content)
    content = re.sub(" +", " ", content)
    # Converts text between </p><p> tags to unicode in case of utf-8 chars.
    content = unicode(content, 'utf-8')
    content = unescape(content)

    # Sentence segmentation
    content = multiple_replace(DICT1, content)
    # Removes . in abbreviations when . preceded by capital letter
    # and followed by capital letter, comma, apostrophe, or space
    # and not followed by a capital letter after that.
    content = re.sub(ur'(?< =[A-Z])\.(?=[A-Z,\u2019\'\s][^A-Z])', '', content)
    for m in re.finditer(SENTENCE, content):
        sents.append(m.group(0))
    output.write("# of sentences: %d\n" % len(sents))

    # Word tokenization
    words = re.split("\s+", multiple_replace(DICT2, content.strip()))
    output.write("# of words: %d\n" % len(words))

    # Counts words in first sentence.
    sent1_len = len(re.split("\s+", multiple_replace(DICT2, sents[0])))
    output.write("# of words in 1st sentence: %d\n" % sent1_len)

    chars = 0
    for word in words:
        for char in word:
            chars += 1
    output.write("# of characters: %d\n" % chars)

    output.write("avg # of words/sentence: %.2f\n"
                 % (len(words) / len(sents)))
    output.write("avg # of characters/word: %.2f\n"
                 % (chars / len(words)))

    # Calculates lexical richness
    words_lower = []
    for word in words[:]:
        words_lower.append(word.lower())
    output.write("lexical richness: %.2f\n\n"
                 % (len(set(words_lower)) / len(words)))

    time.sleep(URL_REQUEST_DELAY)

output.close()

I would like to add that this regular expression in my program for sentence segmentation took me three days to construct:

[\'"\u201c(]{0,2}[A-Z]([^.?!]*[.?!][\'"\u201d)]{0,2})+?(?!\s+[a-z]|\d)

It correctly identifies sentences most of the time if the input text is properly prepared beforehand. But this regex will segment a sentence that includes a name like “John O. Brennan” into two.