Parcourir la source

more of toms scrapers

import/raw
memespring il y a 17 ans
Parent
révision
54a24331cc
3 fichiers modifiés avec 350 ajouts et 0 suppressions
  1. +108
    -0
      trunk/cgi-bin/Dacorum.cgi
  2. +122
    -0
      trunk/cgi-bin/EastHerts.cgi
  3. +120
    -0
      trunk/cgi-bin/Enfield.cgi

+ 108
- 0
trunk/cgi-bin/Dacorum.cgi Voir le fichier

@@ -0,0 +1,108 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Dacorum planning search
our $SearchURL = "http://www.dacorum.gov.uk/default.aspx?page=1495";
our $InfoURL = "http://www.dacorum.gov.uk/Default.aspx?page=1497&ID=";
our $CommentURL = "http://www.dacorum.gov.uk/Default.aspx?page=2847&ID=";

# We're a CGI script...
my $query = CGI->new();

# Read each date component in scalar context. Calling param() directly
# inside the form hash below would evaluate it in list context, where a
# missing parameter yields an empty list (shifting every later key/value
# pair) and a repeated parameter injects extra form fields.
my %date_part;

foreach my $name ("day", "month", "year")
{
    my $value = $query->param($name);

    die "Missing required parameter: $name"
        unless defined($value) && length($value);

    $date_part{$name} = $value;
}

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1,
                              cookie_jar => {},
                              requests_redirectable => [ 'GET', 'HEAD', 'POST' ]);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Do the search; the same date is used for both ends of the range
my %search_form = (
    "__VIEWSTATE" => $state,
    "Template:_ctl10:_ctl0:btnSearch" => "Search",
    "Template:_ctl10:_ctl0:tbRegistrationFromDay" => $date_part{"day"},
    "Template:_ctl10:_ctl0:tbRegistrationFromMon" => $date_part{"month"},
    "Template:_ctl10:_ctl0:tbRegistrationFromYear" => $date_part{"year"},
    "Template:_ctl10:_ctl0:tbRegistrationToDay" => $date_part{"day"},
    "Template:_ctl10:_ctl0:tbRegistrationToMon" => $date_part{"month"},
    "Template:_ctl10:_ctl0:tbRegistrationToYear" => $date_part{"year"},
);

my $page = do_post(\%search_form);

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Dacorum Borough Council");
$Writer->dataElement("authority_short_name", "Dacorum");
$Writer->startTag("applications");

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid");

# A page without a result table is treated as "no applications" so that the
# XML document started above is still closed properly instead of the script
# dying half way through its output.
my @rows = defined($table) ? $table->look_down("_tag" => "tr") : ();

# Process each row of the results
foreach my $row (@rows)
{
    my @cells = $row->look_down("_tag" => "td");

    # Four cells are read below; skip anything shorter (e.g. header rows)
    next unless @cells >= 4;

    # Only rows whose first cell carries one of the data item classes
    # describe an application
    my $class = $cells[0]->attr("class");

    next unless defined($class);
    next unless $class eq "FormGridDataItem" ||
                $class eq "FormGridAlternatingDataItem";

    my $reference = $cells[0]->as_trimmed_text;
    my $address = $cells[1]->as_trimmed_text;
    my $description = $cells[2]->as_trimmed_text;
    my $date = $cells[3]->as_trimmed_text;

    # Default to an empty string so that an address without a trailing
    # postcode still produces an (empty) postcode element without passing
    # undef to the XML writer
    my $postcode = "";

    if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
    {
        $postcode = $1;
    }

    $Writer->startTag("application");
    $Writer->dataElement("council_reference", $reference);
    $Writer->dataElement("address", $address);
    $Writer->dataElement("postcode", $postcode);
    $Writer->dataElement("description", $description);
    $Writer->dataElement("info_url", $InfoURL . $reference);
    $Writer->dataElement("comment_url", $CommentURL . $reference);
    $Writer->dataElement("date_received", $date);
    $Writer->endTag("application");
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the state from a page so we can repost it
#
# Takes a parsed page object (anything providing look_down) and returns the
# "value" attribute of its first input element named __VIEWSTATE.
# Dies with an explicit message when the page contains no such input,
# rather than failing with a method call on an undefined value.
sub get_state
{
    my $page = shift;

    # Scalar context: look_down returns the first matching element, or
    # nothing when there is no match
    my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");

    die "No __VIEWSTATE input found in search page"
        unless defined($viewstate);

    return $viewstate->attr("value");
}

# Post to the planning search page
#
# Any arguments are forwarded to the user agent's post() after the search
# URL. Dies with the HTTP status line if the request was not successful;
# otherwise returns the response body parsed by HTML::TreeBuilder.
sub do_post
{
    my @post_args = @_;
    my $reply = $UA->post($SearchURL, @post_args);

    if (!$reply->is_success)
    {
        die $reply->status_line;
    }

    my $tree = HTML::TreeBuilder->new_from_content($reply->content);

    return $tree;
}

+ 122
- 0
trunk/cgi-bin/EastHerts.cgi Voir le fichier

@@ -0,0 +1,122 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the East Herts planning search
our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";
our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID=";

# URI->new_abs is called below, so load URI explicitly rather than relying
# on some other module having loaded it
use URI;

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch, refusing to search when a component is missing
# instead of building a malformed date out of undefined values
my @date_parts;

foreach my $name ("day", "month", "year")
{
    my $value = $query->param($name);

    die "Missing required parameter: $name"
        unless defined($value) && length($value);

    push @date_parts, $value;
}

my $date = join("/", @date_parts);

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search; the same date is used for both ends of the range
my $page = do_post($SearchURL,
                   {"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
                    "REGTODATE.MAINBODY.WPACIS.1." => $date,
                    "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "East Herts Council");
$Writer->dataElement("authority_short_name", "East Herts");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages; the links are collected from the
# first page only, and relative hrefs are resolved against the search URL
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
    # Fetch this page...
    my $next_page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

    # ...and output the applications from it
    output_applications($next_page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request
#
# All arguments are forwarded to the user agent's get(). Dies with the HTTP
# status line if the request was not successful; otherwise returns the
# response body parsed by HTML::TreeBuilder.
sub do_get
{
    my @get_args = @_;
    my $reply = $UA->get(@get_args);

    if (!$reply->is_success)
    {
        die $reply->status_line;
    }

    my $tree = HTML::TreeBuilder->new_from_content($reply->content);

    return $tree;
}

# Make a POST request
#
# All arguments are forwarded to the user agent's post(). Dies with the HTTP
# status line if the request was not successful; otherwise returns the
# response body parsed by HTML::TreeBuilder.
sub do_post
{
    my @post_args = @_;
    my $reply = $UA->post(@post_args);

    if (!$reply->is_success)
    {
        die $reply->status_line;
    }

    my $tree = HTML::TreeBuilder->new_from_content($reply->content);

    return $tree;
}

# Output applications from a results page
#
# Takes a parsed results page, finds the results table and writes one
# <application> element to the file-level XML writer for every row with at
# least three cells (reference, description, address). The received date is
# the file-level search date. Returns nothing.
sub output_applications
{
    my $page = shift;

    # Find the result table
    my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2");

    # A page without a result table is treated as having no applications,
    # so the XML document the caller has already started can still be
    # closed properly instead of the script dying mid-output
    return unless defined($table);

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr"))
    {
        my @cells = $row->look_down("_tag" => "td");

        # Rows with fewer than three cells are not application rows
        next unless @cells >= 3;

        my $reference = $cells[0]->as_trimmed_text;
        my $description = $cells[1]->as_trimmed_text;
        my $address = $cells[2]->as_trimmed_text;

        # Default to an empty string so that an address without a trailing
        # postcode still produces an (empty) postcode element without
        # passing undef to the XML writer
        my $postcode = "";

        if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
        {
            $postcode = $1;
        }

        $Writer->startTag("application");
        $Writer->dataElement("council_reference", $reference);
        $Writer->dataElement("address", $address);
        $Writer->dataElement("postcode", $postcode);
        $Writer->dataElement("description", $description);
        $Writer->dataElement("info_url", $InfoURL . $reference);
        $Writer->dataElement("comment_url", $CommentURL . $reference);
        $Writer->dataElement("date_received", $date);
        $Writer->endTag("application");
    }

    return;
}

+ 120
- 0
trunk/cgi-bin/Enfield.cgi Voir le fichier

@@ -0,0 +1,120 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Enfield planning search
our $SearchURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";

# URI->new_abs is called below, so load URI explicitly rather than relying
# on some other module having loaded it
use URI;

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch, refusing to search when a component is missing
# instead of building a malformed date out of undefined values
my @date_parts;

foreach my $name ("day", "month", "year")
{
    my $value = $query->param($name);

    die "Missing required parameter: $name"
        unless defined($value) && length($value);

    push @date_parts, $value;
}

my $date = join("/", @date_parts);

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search; the same date is used for both ends of the range
my $page = do_post($SearchURL,
                   {"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
                    "REGTODATE.MAINBODY.WPACIS.1." => $date,
                    "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Enfield Council");
$Writer->dataElement("authority_short_name", "Enfield");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages; the links are collected from the
# first page only, and relative hrefs are resolved against the search URL
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
    # Fetch this page...
    my $next_page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

    # ...and output the applications from it
    output_applications($next_page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request
#
# Hands every argument to the user agent's get() untouched. A response that
# is not successful is fatal (the HTTP status line becomes the die message);
# a successful one has its body parsed by HTML::TreeBuilder and returned.
sub do_get
{
    my @get_args = @_;
    my $answer = $UA->get(@get_args);

    unless ($answer->is_success)
    {
        die $answer->status_line;
    }

    my $parsed = HTML::TreeBuilder->new_from_content($answer->content);

    return $parsed;
}

# Make a POST request
#
# Hands every argument to the user agent's post() untouched. A response that
# is not successful is fatal (the HTTP status line becomes the die message);
# a successful one has its body parsed by HTML::TreeBuilder and returned.
sub do_post
{
    my @post_args = @_;
    my $answer = $UA->post(@post_args);

    unless ($answer->is_success)
    {
        die $answer->status_line;
    }

    my $parsed = HTML::TreeBuilder->new_from_content($answer->content);

    return $parsed;
}

# Output applications from a results page
#
# Takes a parsed results page, finds the results table and writes one
# <application> element to the file-level XML writer for every row with at
# least three cells (reference, description, address). The received date is
# the file-level search date. Returns nothing.
sub output_applications
{
    my $page = shift;

    # Find the result table
    my $table = $page->look_down("_tag" => "table", "class" => "apas_tbl");

    # A page without a result table is treated as having no applications,
    # so the XML document the caller has already started can still be
    # closed properly instead of the script dying mid-output
    return unless defined($table);

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr"))
    {
        my @cells = $row->look_down("_tag" => "td");

        # Rows with fewer than three cells are not application rows
        next unless @cells >= 3;

        my $reference = $cells[0]->as_trimmed_text;
        my $description = $cells[1]->as_trimmed_text;
        my $address = $cells[2]->as_trimmed_text;

        # Default to an empty string so that an address without a trailing
        # postcode still produces an (empty) postcode element without
        # passing undef to the XML writer
        my $postcode = "";

        if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
        {
            $postcode = $1;
        }

        $Writer->startTag("application");
        $Writer->dataElement("council_reference", $reference);
        $Writer->dataElement("address", $address);
        $Writer->dataElement("postcode", $postcode);
        $Writer->dataElement("description", $description);
        $Writer->dataElement("info_url", $InfoURL . $reference);
        $Writer->dataElement("date_received", $date);
        $Writer->endTag("application");
    }

    return;
}

Chargement…
Annuler
Enregistrer