This Perl script scrapes The New York Times website.
#!/usr/bin/env perl use strict; use LWP::UserAgent; use HTTP::Cookies; use HTML::TreeBuilder 3; my $OUTPUT_FILE = 'nyt_top_stories.txt'; # User agent needs to accept cookies to access NYT my $cookie = 'nyt_cookie.lwp'; my $cookie_jar = HTTP::Cookies->new('file' => $cookie, 'autosave' => 1); my $content = get_html('http://global.nytimes.com/'); my $tree = HTML::TreeBuilder->new_from_content($content); # Stores homepage URLS my @urls; scan_nyt_tree($tree, 'http://global.nytimes.com/'); $tree->delete(); unlink $OUTPUT_FILE; # Scrape article from each URL foreach (@urls) { $content = get_html($_); # Replace all newline characters, needed for $rawtext extraction $content =~ s/\n//g; # Extracts headline, byline, dateline, and raw text my $headline; if ($content =~ m/<nyt_headline .*?>(.*?)< \/NYT_HEADLINE>/) { $headline = $1; } my $byline; if ($content =~ m/<nyt_byline .*?>.*?<a \shref.*?>(.*?)< \/a>/) { $byline = $1; } my $dateline; if ($content =~ m/class="dateline">.*?Published:\s+([\w\s,]+)< \//) { $dateline = $1; } my $rawtext; if ($content =~ m/<NYT_TEXT.*?>(.*)< \/NYT_TEXT>/) { $rawtext = $1; } # Parses article's text by extracting everything between <p> tags my $text; while ($rawtext =~ m/</p><p>(.*?)< \/p>/g) { $text .= "\n\n$1"; } $text =~ s/ +/ /g; # REPLACE MUTLIPLE SPACES WITH ONE $text =~ s/< .*?>//g; # REMOVE HTML TAGS $text =~ s/—/--/g; # REPLACE HTML EM-DASH CODE WITH 2 HYPHENS $text =~ s/’|‘/'/g; # REPLACE SMART APOSTROPHES WITH ' $text =~ s/“|”/"/g; # REPLACE SMART QUOTATIONS WITH " $text =~ s/ / /g; open(OUTPUT, ">>$OUTPUT_FILE") or die("Cannot open $OUTPUT_FILE\n"); print OUTPUT "$headline\n$byline\n$dateline$text\n\n\n"; close(OUTPUT); } # Stores a web page's HTML as string sub get_html { my $url = $_[0]; my $browser = LWP::UserAgent->new(); $browser->cookie_jar($cookie_jar); # $response declared out here to be accessible after while loop my $response; # Prevents inifinite loops my $redirect_limit = 5; my $x = 0; # Sends GET request, follows redirects until response code 200 received # Stores successful request URL my $responseCode = 0; while ($responseCode != 200 && $x < $redirect_limit) { $response = $browser->get($url); $responseCode = $response->code; print "$url\n"; #print "response code: $responseCode\n"; $url = $response->header('Location'); $x++; } return $response->content; } # Picks out URLs of top NYT articles sub scan_nyt_tree { my ($root, $docbase) = @_; foreach my $div ($root->find_by_tag_name('div')) { my $class = $div->attr('class') || next; if ($class eq 'story') { my @children = $div->content_list; for (my $i = 0; $i < = $#children; $i++) { if (ref $children[$i] and ($children[$i]->tag eq 'h2' || $children[$i]->tag eq 'h3' || $children[$i]->tag eq 'h5')) { my @grandchildren = $children[$i]->content_list; # Search sibling if 1st grandchild not <a> if (ref $grandchildren[0] and $grandchildren[0]->tag eq 'a') { push (@urls, URI->new_abs($grandchildren[0]->attr('href') || next, $docbase)); } } } } } return; }