David Xia

New York Times Perl Web Scraper

This Perl script scrapes The New York Times website.

 
#!/usr/bin/env perl
 
use strict;
use LWP::UserAgent;
use HTTP::Cookies;
use HTML::TreeBuilder 3;
 
my $OUTPUT_FILE = 'nyt_top_stories.txt';
 
# User agent needs to accept cookies to access NYT
my $cookie = 'nyt_cookie.lwp';
my $cookie_jar = HTTP::Cookies->new('file' => $cookie, 'autosave' => 1);
 
my $content = get_html('http://global.nytimes.com/');
my $tree = HTML::TreeBuilder->new_from_content($content);
 
# Stores homepage URLS
my @urls;
scan_nyt_tree($tree, 'http://global.nytimes.com/');
$tree->delete();
 
unlink $OUTPUT_FILE;
 
# Scrape article from each URL
foreach (@urls) {
    $content = get_html($_);
    # Replace all newline characters, needed for $rawtext extraction
    $content =~ s/\n//g;
 
    # Extracts headline, byline, dateline, and raw text
    my $headline;
    if ($content =~ m/<nyt_headline .*?>(.*?)< \/NYT_HEADLINE>/) {
        $headline = $1;
    }
 
    my $byline;
    if ($content =~ m/<nyt_byline .*?>.*?<a \shref.*?>(.*?)< \/a>/) {
        $byline = $1;
    }
 
    my $dateline;
    if ($content =~ m/class="dateline">.*?Published:\s+([\w\s,]+)< \//) {
        $dateline = $1;
    }
 
    my $rawtext;
    if ($content =~ m/<NYT_TEXT.*?>(.*)< \/NYT_TEXT>/) {
        $rawtext = $1;
    }
 
    # Parses article's text by extracting everything between <p> tags
    my $text;
    while ($rawtext =~ m/</p><p>(.*?)< \/p>/g) {
        $text .= "\n\n$1";
    }
    $text =~ s/ +/ /g;              # REPLACE MUTLIPLE SPACES WITH ONE
    $text =~ s/< .*?>//g;            # REMOVE HTML TAGS
    $text =~ s/&mdash;/--/g;        # REPLACE HTML EM-DASH CODE WITH 2 HYPHENS
    $text =~ s/&rsquo;|&lsquo;/'/g; # REPLACE SMART APOSTROPHES WITH '
    $text =~ s/&ldquo;|&rdquo;/"/g; # REPLACE SMART QUOTATIONS WITH "
    $text =~ s/&nbsp;/ /g;
 
    open(OUTPUT, ">>$OUTPUT_FILE") or die("Cannot open $OUTPUT_FILE\n");
    print OUTPUT "$headline\n$byline\n$dateline$text\n\n\n";
    close(OUTPUT);
}
 
# Stores a web page's HTML as string
sub get_html {
    my $url = $_[0];
    my $browser = LWP::UserAgent->new();
    $browser->cookie_jar($cookie_jar);
 
    # $response declared out here to be accessible after while loop
    my $response;
    # Prevents inifinite loops
    my $redirect_limit = 5;
    my $x = 0;
 
    # Sends GET request, follows redirects until response code 200 received
    # Stores successful request URL
    my $responseCode = 0;
    while ($responseCode != 200 && $x < $redirect_limit) {
        $response = $browser->get($url);
        $responseCode = $response->code;
        print "$url\n";
        #print "response code: $responseCode\n";
        $url = $response->header('Location');
        $x++;
    }
    return $response->content;
}
 
# Picks out URLs of top NYT articles
sub scan_nyt_tree {
   my ($root, $docbase) = @_;
   foreach my $div ($root->find_by_tag_name('div')) {
       my $class = $div->attr('class') || next;
       if ($class eq 'story') {
           my @children = $div->content_list;
           for (my $i = 0; $i < = $#children; $i++) {
               if (ref $children[$i] and
                   ($children[$i]->tag eq 'h2' ||
                   $children[$i]->tag eq 'h3' ||
                   $children[$i]->tag eq 'h5')) {
                   my @grandchildren = $children[$i]->content_list;
                       # Search sibling if 1st grandchild not <a>
                       if (ref $grandchildren[0] and $grandchildren[0]->tag eq 'a') {
                       push (@urls, URI->new_abs($grandchildren[0]->attr('href') || next, $docbase));
                   }
 
               }
           }
       }
   }
   return;
}
Post a comment or leave a trackback URL.

Post a Comment

Your email is never published nor shared. Required fields are marked *

*
*

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong> <pre lang="" line="" escaped="" highlight="">