New York Times Perl Web Scraper

This Perl script scrapes The New York Times website.
#!/usr/bin/env perl

use strict;
use LWP::UserAgent;
use HTTP::Cookies;
use HTML::TreeBuilder 3;

my $OUTPUT_FILE = 'nyt_top_stories.txt';

# User agent needs to accept cookies to access NYT
my $cookie = 'nyt_cookie.lwp';
my $cookie_jar = HTTP::Cookies->new('file' => $cookie, 'autosave' => 1);

my $content = get_html('http://global.nytimes.com/');
my $tree = HTML::TreeBuilder->new_from_content($content);

# Stores homepage URLS
my @urls;
scan_nyt_tree($tree, 'http://global.nytimes.com/');
$tree->delete();

unlink $OUTPUT_FILE;

# Scrape article from each URL
foreach (@urls) {
    $content = get_html($_);
    # Replace all newline characters, needed for $rawtext extraction
    $content =~ s/\n//g;

    # Extracts headline, byline, dateline, and raw text
    my $headline;
    if ($content =~ m/<nyt_headline .*?>(.*?)< \/NYT_HEADLINE>/) {
        $headline = $1;
    }

    my $byline;
    if ($content =~ m/<nyt_byline .*?>.*?<a \shref.*?>(.*?)< \/a>/) {
        $byline = $1;
    }

    my $dateline;
    if ($content =~ m/class="dateline">.*?Published:\s+([\w\s,]+)< \//) {
        $dateline = $1;
    }

    my $rawtext;
    if ($content =~ m/<NYT_TEXT.*?>(.*)< \/NYT_TEXT>/) {
        $rawtext = $1;
    }

    # Parses article's text by extracting everything between <p> tags
    my $text;
    while ($rawtext =~ m/</p><p>(.*?)< \/p>/g) {
        $text .= "\n\n$1";
    }
    $text =~ s/ +/ /g;              # REPLACE MUTLIPLE SPACES WITH ONE
    $text =~ s/< .*?>//g;           # REMOVE HTML TAGS
    $text =~ s/&mdash;/--/g;        # REPLACE HTML EM-DASH CODE WITH 2 HYPHENS
    $text =~ s/'|&lsquo;/'/g; # REPLACE SMART APOSTROPHES WITH '
    $text =~ s/"|&rdquo;/"/g; # REPLACE SMART QUOTATIONS WITH "
    $text =~ s/&nbsp;/ /g;

    open(OUTPUT, ">>$OUTPUT_FILE") or die("Cannot open $OUTPUT_FILE\n");
    print OUTPUT "$headline\n$byline\n$dateline$text\n\n\n";
    close(OUTPUT);
}

# Stores a web page's HTML as string
sub get_html {
    my $url = $_[0];
    my $browser = LWP::UserAgent->new();
    $browser->cookie_jar($cookie_jar);

    # $response declared out here to be accessible after while loop
    my $response;
    # Prevents inifinite loops
    my $redirect_limit = 5;
    my $x = 0;

    # Sends GET request, follows redirects until response code 200 received
    # Stores successful request URL
    my $responseCode = 0;
    while ($responseCode != 200 && $x < $redirect_limit) {
        $response = $browser->get($url);
        $responseCode = $response->code;
        print "$url\n";
        #print "response code: $responseCode\n";
        $url = $response->header('Location');
        $x++;
    }
    return $response->content;
}

# Picks out URLs of top NYT articles
sub scan_nyt_tree {
   my ($root, $docbase) = @_;
   foreach my $div ($root->find_by_tag_name('div')) {
       my $class = $div->attr('class') || next;
       if ($class eq 'story') {
           my @children = $div->content_list;
           for (my $i = 0; $i < = $#children; $i++) {
               if (ref $children[$i] and
                   ($children[$i]->tag eq 'h2' ||
                   $children[$i]->tag eq 'h3' ||
                   $children[$i]->tag eq 'h5')) {
                   my @grandchildren = $children[$i]->content_list;
                       # Search sibling if 1st grandchild not <a>
                       if (ref $grandchildren[0] and $grandchildren[0]->tag eq 'a') {
                       push (@urls, URI->new_abs($grandchildren[0]->attr('href') || next, $docbase));
                   }

               }
           }
       }
   }
   return;
}