New York Times Perl Web Scraper

This Perl script scrapes the top stories linked from the New York Times global homepage and appends each article's headline, byline, dateline, and body text to nyt_top_stories.txt.

#!/usr/bin/env perl

use strict;
use LWP::UserAgent;
use HTTP::Cookies;
use HTML::TreeBuilder 3;

# File that receives the scraped articles
my $OUTPUT_FILE = 'nyt_top_stories.txt';

# NYT cannot be accessed unless the user agent accepts cookies, so every
# request shares this file-backed cookie jar (see get_html).
my $cookie = 'nyt_cookie.lwp';
my $cookie_jar = HTTP::Cookies->new(
    'file'     => $cookie,
    'autosave' => 1,
);

# Fetch the homepage and parse it into an element tree
my $homepage_url = 'http://global.nytimes.com/';
my $content = get_html($homepage_url);
my $tree = HTML::TreeBuilder->new_from_content($content);

# Article URLs found on the homepage; filled in by scan_nyt_tree
my @urls;
scan_nyt_tree($tree, $homepage_url);
$tree->delete;

# Start from a clean output file; the scraping loop appends to it
unlink($OUTPUT_FILE);

# Scrape each article URL collected from the homepage and append its
# headline, byline, dateline, and body text to $OUTPUT_FILE.
foreach my $url (@urls) {
    my $page = get_html($url);
    # Collapse the page onto a single line so that '.' in the patterns below
    # can span what used to be line breaks (needed for the NYT_TEXT capture).
    $page =~ s/\n//g;

    # Each field stays empty when its markup is not present on the page.
    my $headline = '';
    if ($page =~ m{<NYT_HEADLINE.*?>(.*?)</NYT_HEADLINE>}i) {
        $headline = $1;
    }

    # The byline is the text of the first link inside the NYT_BYLINE element.
    my $byline = '';
    if ($page =~ m{<NYT_BYLINE.*?>.*?<a\shref.*?>(.*?)</a>}i) {
        $byline = $1;
    }

    my $dateline = '';
    if ($page =~ m{class="dateline">.*?Published:\s+([\w\s,]+)</}) {
        $dateline = $1;
    }

    # Greedy on purpose: capture everything up to the LAST closing NYT_TEXT.
    my $rawtext = '';
    if ($page =~ m{<NYT_TEXT.*?>(.*)</NYT_TEXT>}i) {
        $rawtext = $1;
    }

    # Body text: every <p>...</p> paragraph, each preceded by a blank line.
    my $text = '';
    while ($rawtext =~ m{<p>(.*?)</p>}g) {
        $text .= "\n\n$1";
    }
    $text =~ s/ +/ /g;                # collapse runs of spaces into one
    $text =~ s/<.*?>//g;              # strip any remaining HTML tags
    $text =~ s/&mdash;/--/g;          # em-dash entity -> two hyphens
    $text =~ s/&rsquo;|&lsquo;/'/g;   # smart apostrophe entities -> '
    $text =~ s/&ldquo;|&rdquo;/"/g;   # smart quotation entities -> "
    $text =~ s/&nbsp;/ /g;            # non-breaking space entity -> space

    # Three-argument open with a lexical handle; report the OS error on failure.
    open(my $out, '>>', $OUTPUT_FILE)
        or die("Cannot open $OUTPUT_FILE: $!\n");
    print {$out} "$headline\n$byline\n$dateline$text\n\n\n";
    # Buffered write errors only surface at close, so check it too.
    close($out) or die("Cannot close $OUTPUT_FILE: $!\n");
}

# Fetches a web page and returns its body as a string.
#
# Arguments:
#   $url - address to request.
# Returns:
#   The content of the last response received. This is the page body when a
#   200 response was obtained, otherwise whatever body the final (failed)
#   response carried.
# Side effects:
#   Prints every URL requested to STDOUT, shares the file-level $cookie_jar
#   with the user agent, and warns on STDERR when no 200 response was obtained.
sub get_html {
    my ($url) = @_;
    my $browser = LWP::UserAgent->new();
    $browser->cookie_jar($cookie_jar);

    # Upper bound on requests, prevents infinite redirect loops
    my $redirect_limit = 5;
    # Declared outside the loop so the last response is available afterwards
    my $response;

    # Send GET requests, following Location headers by hand, until a 200
    # response arrives or the limit is reached.
    foreach my $attempt (1 .. $redirect_limit) {
        $response = $browser->get($url);
        print "$url\n";
        last if $response->code == 200;

        # A failed response without a Location header gives us nowhere else
        # to go; stop instead of re-requesting an undefined URL.
        my $location = $response->header('Location');
        last unless defined $location && length $location;
        $url = $location;
    }

    if ($response->code != 200) {
        warn "Could not fetch $url: " . $response->status_line . "\n";
    }
    return $response->content;
}

# Picks out URLs of top NYT articles and appends them to the file-level @urls.
#
# Arguments:
#   $root    - parsed HTML tree to search (must support find_by_tag_name).
#   $docbase - base URL used to turn each link's href into an absolute URL.
# Returns:
#   Nothing; results are pushed onto @urls as URI objects.
#
# A link is collected when it is the first child of an <h2>, <h3> or <h5>
# that sits directly inside a <div> whose class attribute is exactly 'story'.
sub scan_nyt_tree {
    my ($root, $docbase) = @_;

    foreach my $div ($root->find_by_tag_name('div')) {
        my $class = $div->attr('class') || next;
        next unless $class eq 'story';

        foreach my $child ($div->content_list) {
            # Non-reference children (plain text) have no tag; skip them.
            next unless ref($child);
            next unless $child->tag =~ /\Ah[235]\z/;

            # Only the heading's first child is examined; headings whose
            # first child is not an <a> element are skipped.
            my ($first) = $child->content_list;
            next unless ref($first) && $first->tag eq 'a';

            my $href = $first->attr('href') || next;
            push @urls, URI->new_abs($href, $docbase);
        }
    }
    return;
}