This Python web scraper extracts top New York Times articles and outputs a UTF-8-encoded .txt file.
import os import urllib2 import cookielib import re import htmlentitydefs import codecs import time from BeautifulSoup import BeautifulSoup URL_REQUEST_DELAY = 1 BASE = 'http://www.nytimes.com' TXDATA = None TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} OUTPUT_FILE = 'nyt_top_stories.txt' def request_url(url, txdata, txheaders): """Gets a webpage's HTML.""" req = Request(url, txdata, txheaders) handle = urlopen(req) html = handle.read() return html def remove_html_tags(data): """Removes HTML tags""" p = re.compile(r'< .*?>') return p.sub('', data) def unescape(text): """ Converts HTML character codes to Unicode code points. @param text the HTML (or XML) source text in any encoding. @return The plain text, as a Unicode string, if necessary. """ def fixup(m): text = m.group(0) if text[:2] == "&#": try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text return re.sub("&#?\w+;", fixup, text) urlopen = urllib2.urlopen Request = urllib2.Request # Install cookie jar in opener for fetching URL cookiejar = cookielib.LWPCookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) urllib2.install_opener(opener) html = request_url('http://global.nytimes.com/', TXDATA, TXHEADERS) # Use BeautifulSoup to easily navigate HTML tree soup = BeautifulSoup(html) # Retrieves html from each url on NYT Global homepage under "story" divs # with h2, h3, or h5 headlines urls = [] for story in soup.findAll('div', {'class': 'story'}): for hTag in story.findAll({'h2': True, 'h3': True, 'h5': True}, recursive=False): if hTag.find('a'): urls.append(hTag.find('a')['href']) # Removes URLs that aren't news articles. # Create a copy of list b/c you can't modify a list while iterating over it. 
for url in urls[:]: if not url.startswith(BASE): urls.remove(url) # Extracts headline, byline, dateline and content; outputs to file if os.path.exists(OUTPUT_FILE): os.remove(OUTPUT_FILE) output = codecs.open(OUTPUT_FILE, 'a', 'utf-8') for url in urls: content = '' html = request_url(url, TXDATA, TXHEADERS) html = unicode(html, 'utf-8') soup = BeautifulSoup(html) # Gets HTML from single page link if article is > 1 page if soup.find('li', {'class': 'singlePage'}): single = soup.find('li', {'class': 'singlePage'}) html = request_url(BASE + single.find('a')['href'], TXDATA, TXHEADERS) html = unicode(html, 'utf-8') soup = BeautifulSoup(html) if not soup.find('nyt_headline'): continue headline = soup.find('nyt_headline').renderContents() print headline output.write(unicode(headline + "\n", 'utf-8')) byline = soup.find('nyt_byline').find('h6').renderContents() byline = remove_html_tags(byline) output.write(unicode(byline + "\n", 'utf-8')) dateline = soup.find('h6', {'class': 'dateline'}).renderContents() output.write(unicode(dateline, 'utf-8')) for p in soup.findAll('p', {'class': None, 'style': None}): # Removes potential ad at the bottom of the page. if p.findParents('div', {'class': 'singleAd'}): continue # Prevents contents of nested <p> tags from being printed twice. if p.findParents('div', {'class': 'authorIdentification'}): continue content = content + "\n\n" + p.renderContents().strip() content = remove_html_tags(content) content = re.sub(" +", " ", content) content = unescape(unicode(content, 'utf-8')) content = content + "\n\n\n\n" output.write(content) time.sleep(URL_REQUEST_DELAY) output.close() </p>
The second script counts, for each article, the number of sentences, words, and characters, and calculates simple statistics such as words per sentence and characters per word.
from __future__ import division import os import urllib2 import cookielib import re import codecs import htmlentitydefs import time from BeautifulSoup import BeautifulSoup URL_REQUEST_DELAY = 1 BASE = 'http://www.nytimes.com' TXDATA = None TXHEADERS = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} OUTPUT_FILE = 'nyt_stats.txt' SENTENCE = re.compile(ur'[\'"\u201c(]{0,2}[A-Z]([^.?!]*[.?!][\'"\u201d)]{0,2})+?(?!\s+[a-z]|\d)', re.UNICODE) # Dictionary to replace common abbreviations for correct sentence segmentation. DICT1 = { "Mr." : "Mr", "Mrs." : "Mrs", "Ms." : "Ms", "Jan." : "Jan", "Feb." : "Feb", "Mar." : "Mar", "Apr." : "Apr", "Jun." : "Jun", "Jul." : "Jul", "Aug." : "Aug", "Sept." : "Sept", "Sep." : "Sep", "Oct." : "Oct", "Nov." : "Nov", "Dec." : "Dec", "Jr." : "Jr", "Brig." : "Brig", "Gen." : "Gen", "Maj." : "Maj", "a.m." : "AM", "p.m." : "PM", "Rev." : "Rev", "Fla." : "Fla", "Dr." : "Dr", "Gov." : "Gov", } # Dictionary to prepare for word tokenization. DICT2 = { '.' : '', ',' : '', u'\u201c' : '', # Left curly quotation mark u'\u201d' : '', # Right curly quotation mark u'\u2014' : '', # Em-dash '"' : '', ' - ' : ' ', '(' : '', ')' : '', ';' : '', ':' : '', '?' : '', '!' : '', '--' : ' ', } def request_url(url, txdata, txheaders): """Gets a webpage's HTML.""" req = Request(url, txdata, txheaders) handle = urlopen(req) html = handle.read() return html def remove_html_tags(data): """Removes HTML tags""" p = re.compile(r'< .*?>') return p.sub('', data) def unescape(text): """ Converts HTML character codes to Unicode code points. @param text the HTML (or XML) source text in any encoding. @return The plain text, as a Unicode string, if necessary. 
""" def fixup(m): text = m.group(0) if text[:2] == "&#": try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text return re.sub("&#?\w+;", fixup, text) def multiple_replace(adict, text): """ Replaces multiple patterns in a string in a single pass Creates a regular expression from all dictionary keys. For each match, replace with the corresponding dictionary value. """ regex = re.compile("|".join(map(re.escape, adict.keys( )))) return regex.sub(lambda match: adict[match.group(0)], text) urlopen = urllib2.urlopen Request = urllib2.Request # Install cookie jar in opener for fetching URL cookiejar = cookielib.LWPCookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) urllib2.install_opener(opener) html = request_url('http://global.nytimes.com/', TXDATA, TXHEADERS) # Use BeautifulSoup to easily navigate HTML tree soup = BeautifulSoup(html) # Retrieves HTML from each URL on NYT Global homepage under "story" divs # with h2, h3, or h5 headlines. urls = [] for story in soup.findAll('div', {'class': 'story'}): for hTag in story.findAll({'h2': True, 'h3': True, 'h5': True}, recursive=False): if hTag.find('a'): urls.append(hTag.find('a')['href']) # Removes URLs that aren't news articles. # Create a copy of list b/c you can't modify a list while iterating over it. for url in urls[:]: if not url.startswith(BASE): urls.remove(url) # Extracts headline, segments sentences, and tokenizes words. if os.path.exists(OUTPUT_FILE): os.remove(OUTPUT_FILE) output = codecs.open(OUTPUT_FILE, 'a', 'utf-8') for url in urls: html = request_url(url, TXDATA, TXHEADERS) html = unicode(html, 'utf-8') soup = BeautifulSoup(html) # Gets HTML from single page link if article is over one page. 
if soup.find('li', {'class': 'singlePage'}): single = soup.find('li', {'class': 'singlePage'}) html = request_url(BASE + single.find('a')['href'], TXDATA, TXHEADERS) html = unicode(html, 'utf-8') soup = BeautifulSoup(html) if not soup.find('nyt_headline'): continue headline = soup.find('nyt_headline').renderContents() print headline output.write(unicode(headline + "\n", 'utf-8')) content = '' sents = [] words = [] for p in soup.findAll('p', {'class': None, 'style': None}): # Removes potential ad at the bottom of the page. if p.findParents('div', {'class': 'singleAd'}): continue # Prevents contents of nested <p> tags from being printed twice. if p.findParents('div', {'class': 'authorIdentification'}): continue content = content + " " + p.renderContents().strip() content = remove_html_tags(content) content = re.sub(" +", " ", content) # Converts text between </p><p> tags to unicode in case of utf-8 chars. content = unicode(content, 'utf-8') content = unescape(content) # Sentence segmentation content = multiple_replace(DICT1, content) # Removes . in abbreviations when . preceded by capital letter # and followed by capital letter, comma, apostrophe, or space # and not followed by a capital letter after that. content = re.sub(ur'(?< =[A-Z])\.(?=[A-Z,\u2019\'\s][^A-Z])', '', content) for m in re.finditer(SENTENCE, content): sents.append(m.group(0)) output.write("# of sentences: %d\n" % len(sents)) # Word tokenization words = re.split("\s+", multiple_replace(DICT2, content.strip())) output.write("# of words: %d\n" % len(words)) # Counts words in first sentence. 
sent1_len = len(re.split("\s+", multiple_replace(DICT2, sents[0]))) output.write("# of words in 1st sentence: %d\n" % sent1_len) chars = 0 for word in words: for char in word: chars += 1 output.write("# of characters: %d\n" % chars) output.write("avg # of words/sentence: %.2f\n" % (len(words) / len(sents))) output.write("avg # of characters/word: %.2f\n" % (chars / len(words))) # Calculates lexical richness words_lower = [] for word in words[:]: words_lower.append(word.lower()) output.write("lexical richness: %.2f\n\n" % (len(set(words_lower)) / len(words))) time.sleep(URL_REQUEST_DELAY) output.close()
I would like to add that this regular expression in my program for sentence segmentation took me three days to construct:
[\'"\u201c(]{0,2}[A-Z]([^.?!]*[.?!][\'"\u201d)]{0,2})+?(?!\s+[a-z]|\d)
It correctly identifies sentences most of the time if the input text is properly prepared beforehand. But this regex will split a sentence that includes a name with a middle initial, like “John O. Brennan,” into two, because the period after the initial is followed by a capitalized word.
One Comment
That does indeed look pretty impressive. I don’t know any Python, just Java, and we don’t ever run into anything that looks that much like random garbage (mostly a bunch of annoyingly long function names). Maybe I’m lost, but why would you want a NYTimes web scraper anyway?
2 Trackbacks
[...] smart quotation marks, em-dashes, and any letter with accent marks correctly. My updated script is here. tweetfacebookCategories: Programming & SoftwarePermalink Post a comment or leave a trackback [...]
[...] Perl and Python regular expressions, a way of matching text, I still favor Perl’s. But my Python translation of my Perl script that web scrapes the New York Times is only half the length and more [...]