Przeglądaj źródła

Add SouthSomerset, Christchurch and West Dorset.

We still need to sort out comment urls here.
import/raw
duncan.parkes 17 lat temu
rodzic
commit
593e9d8272
4 zmienionych plików z 462 dodań i 0 usunięć
  1. +148
    -0
      trunk/python_scrapers/Christchurch.cgi
  2. +3
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +163
    -0
      trunk/python_scrapers/SouthSomerset.cgi
  4. +148
    -0
      trunk/python_scrapers/WestDorset.cgi

+ 148
- 0
trunk/python_scrapers/Christchurch.cgi Wyświetl plik

@@ -0,0 +1,148 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;


# The master URLs for the Christchurch planning search
our $SearchURL = "http://webapps.christchurch.gov.uk/PlanningApplications/pages/ApplicationSearch.aspx";
our $InfoURL = "http://webapps.chirstchurch.gov.uk/PlanningApplications/pages/ApplicationDetails.aspx?Authority=Christchurch%20Borough%20Council&Application=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form
my $page = do_post();

# Do the search
$page = do_post($page,
{"DetailedSearch:TextBox_DateRaisedFrom" => $date,
"DetailedSearch:TextBox_DateRaisedTo" => $date,
"QuickSearchApplicationNumber:Button_SearchApplicationNumber" => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Christchurch Council");
$Writer->dataElement("authority_short_name", "Christchurch");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
while (my $link = $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next"))
{
# Fetch this page...
$page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Fake up what the doPostBack javascript function in the page does...
sub do_post_back
{
my $previous = shift;
my $target = shift;
my $argument = shift;

$target =~ s/\$/:/g;

my $args = {
"__EVENTTARGET" => $target,
"__EVENTARGUMENT" => $argument,
};

return do_post($previous, $args);
}

# Make a POST request
sub do_post
{
my $previous = shift;
my $args = shift || {};

if (defined($previous))
{
my $viewstate = $previous->look_down("_tag" => "input", "name" => "__VIEWSTATE");
# my $eventvalidation = $previous->look_down("_tag" => "input", "name" => "__EVENTVALIDATION");

$args->{"__VIEWSTATE"} = $viewstate->attr("value");
# $args->{"__EVENTVALIDATION"} = $eventvalidation->attr("value");
}

my $response = $UA->post($SearchURL, $args);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page
sub output_applications
{
my $page = shift;

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "searchresults");

# No results means no results table
if (defined($table))
{
# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my $class = $row->attr("class") || "";

next if $class eq "searchresultsheader";

my @cells = $row->look_down("_tag" => "td");
my $reference = $cells[0]->as_trimmed_text;
my $date = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $description = $cells[3]->as_trimmed_text;
my $postcode;

if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

+ 3
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Wyświetl plik

@@ -16,3 +16,6 @@
"PlanningExplorer.py", "420"
"SwiftLG.py", "420"
"Dacorum.cgi", "493"
"SouthSomerset.cgi", "493"
"WestDorset.cgi", "493"
"Christchurch.cgi", "493"

+ 163
- 0
trunk/python_scrapers/SouthSomerset.cgi Wyświetl plik

@@ -0,0 +1,163 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# Month names
our %Months = ( 1 => "Jan", 2 => "Feb", 3 => "Mar", 4 => "Apr",
5 => "May", 5 => "Jun", 7 => "Jul", 8 => "Aug",
9 => "Sep", 10 => "Oct", 11 => "Nov", 12 => "Dec" );

# The master URLs for the Enfield planning search
our $StartURL = "http://www.southsomerset.gov.uk/index.jsp?articleid=1925&page_name=startsearch";
our $SearchURL = "http://www.southsomerset.gov.uk/index.jsp?articleid=1925&page_name=searchresults";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch
my $date = $query->param("day") . "-" . $Months{$query->param("month")} . "-" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1, cookie_jar => {});

# Post acceptance of terms and conditions to get a cookie
do_post($StartURL, {"acceptTC" => "on"});

# Do the search
my $page = do_post($SearchURL,
{"startdate" => "12-Nov-2007", #$date,
"enddate" => $date,
"datesearch" => "applications",
"timeframe" => "yearonly",
"btnsubmit" => "search",
"address" => "",
"area" => "",
"caseno" => "",
"decision" => "",
"location" => "",
"parish" => "",
"postcode" => "",
"recentweeks" => "",
"ward" => ""});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "South Somerset District Council");
$Writer->dataElement("authority_short_name", "South Somerset");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
while (my $link = $page->look_down("_tag" => "a", sub { $_[0]->as_text eq "Next Page" }))
{
# Fetch this page...
$page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request
sub do_get
{
my $response = $UA->get(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request
sub do_post
{
my $response = $UA->post(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page
sub output_applications
{
my $page = shift;
my $reference;
my $address;
my $postcode;
my $description;
my $date_received;
my $info_url;

# Find the result table
my $table = $page->look_down("_tag" => "div", "class" => "mainText")->look_down("_tag" => "table");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

if (@cells == 1 && $cells[0]->look_down("_tag" => "hr"))
{
if (defined($reference))
{
$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $info_url);
$Writer->dataElement("date_received", $date_received);
$Writer->endTag("application");
}

undef $reference;
undef $address;
undef $postcode;
undef $description;
undef $date_received;
undef $info_url
}
elsif (@cells == 1 && defined($reference))
{
$description = $cells[0]->as_trimmed_text;

$description =~ s/^Proposal:\s*//;
}
elsif (@cells == 5 && $cells[0]->as_trimmed_text =~ /^\d+/)
{
$reference = $cells[0]->as_trimmed_text;
$date_received = $cells[1]->as_trimmed_text;
$address = $cells[2]->as_trimmed_text;
$info_url = URI->new_abs($cells[4]->look_down("_tag" => "a")->attr("href"), $SearchURL);

if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}
}
}

return;
}

+ 148
- 0
trunk/python_scrapers/WestDorset.cgi Wyświetl plik

@@ -0,0 +1,148 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;


# The master URLs for the West Dorset planning search
our $SearchURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx";
our $InfoURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Authority=West%20Dorset%20District%20Council&Application=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form
my $page = do_post();

# Do the search
$page = do_post($page,
{"DetailedSearch\$TextBox_DateRaisedFrom" => $date,
"DetailedSearch\$TextBox_DateRaisedTo" => $date,
"DetailedSearch\$Button_DetailedSearch" => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "West Dorset District Council");
$Writer->dataElement("authority_short_name", "West Dorset");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
while (my $link = $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next"))
{
# Fetch this page...
$page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Fake up what the doPostBack javascript function in the page does...
sub do_post_back
{
my $previous = shift;
my $target = shift;
my $argument = shift;

$target =~ s/\$/:/g;

my $args = {
"__EVENTTARGET" => $target,
"__EVENTARGUMENT" => $argument,
};

return do_post($previous, $args);
}

# Make a POST request
sub do_post
{
my $previous = shift;
my $args = shift || {};

if (defined($previous))
{
my $viewstate = $previous->look_down("_tag" => "input", "name" => "__VIEWSTATE");
my $eventvalidation = $previous->look_down("_tag" => "input", "name" => "__EVENTVALIDATION");

$args->{"__VIEWSTATE"} = $viewstate->attr("value");
$args->{"__EVENTVALIDATION"} = $eventvalidation->attr("value");
}

my $response = $UA->post($SearchURL, $args);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page
sub output_applications
{
my $page = shift;

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "searchresults");

# No results means no results table
if (defined($table))
{
# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my $class = $row->attr("class") || "";

next if $class eq "searchresultsheader";

my @cells = $row->look_down("_tag" => "td");
my $reference = $cells[0]->as_trimmed_text;
my $date = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $description = $cells[3]->as_trimmed_text;
my $postcode;

if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

Ładowanie…
Anuluj
Zapisz