소스 검색

Add python parser for West Dorset, and remove the non-working perl one.

master
duncan.parkes 16 년 전
부모
커밋
15fca1a280
4개의 변경된 파일, 90줄 추가 및 150줄 삭제
  1. +1
    -1
      python_scrapers/OtherFilesToCopy.csv
  2. +1
    -0
      python_scrapers/SitesToGenerate.csv
  3. +0
    -149
      python_scrapers/WestDorset.cgi
  4. +88
    -0
      python_scrapers/WestDorset.py

+ 1
- 1
python_scrapers/OtherFilesToCopy.csv 파일 보기

@@ -16,7 +16,6 @@
"SwiftLG.py", "420"
"Dacorum.cgi", "493"
"SouthSomerset.cgi", "493"
"WestDorset.cgi", "493"
"Christchurch.cgi", "493"
"WAM.py", "420"
"Planet.py", "420"
@@ -60,3 +59,4 @@
"Exmoor.py", "420"
"Eastbourne.py", "420"
"Gosport.py", "420"
"WestDorset.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv 파일 보기

@@ -265,3 +265,4 @@
"Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser"
"Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"

+ 0
- 149
python_scrapers/WestDorset.cgi 파일 보기

@@ -1,149 +0,0 @@
#!/usr/bin/perl

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;


# Endpoints of the West Dorset planning search: the search form itself and
# the application details page (an application reference gets appended to
# the latter).
our $SearchURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx";
our $InfoURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Authority=West%20Dorset%20District%20Council&Application=";

# This runs as a CGI script; the date to fetch arrives in the "day",
# "month" and "year" request parameters.
my $query = CGI->new();

# Each parameter is read in scalar context and the three are joined into
# the dd/mm/yyyy form the search fields are given.
my $day   = $query->param("day");
my $month = $query->param("month");
my $year  = $query->param("year");
my $date  = $day . "/" . $month . "/" . $year;

# User agent shared by every request; proxy settings come from the
# environment.
our $UA = LWP::UserAgent->new(env_proxy => 1);

# A first POST without any fields yields the initial blank form...
my $page = do_post();

# ...which is then submitted with the requested day as both ends of the
# "date raised" range.
my %search_fields = (
    "DetailedSearch\$TextBox_DateRaisedFrom" => $date,
    "DetailedSearch\$TextBox_DateRaisedTo" => $date,
    "DetailedSearch\$Button_DetailedSearch" => "Search",
);
$page = do_post($page, \%search_fields);

# HTTP response header for the XML document that follows
print $query->header(-type => "text/xml");

# XML output stream; output_applications() writes to this same writer
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Document preamble
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "West Dorset District Council");
$Writer->dataElement("authority_short_name", "West Dorset");
$Writer->startTag("applications");

# Emit the applications of the current results page, then keep following
# the "next" link for as long as the page offers one.
while (1)
{
    output_applications($page);

    last unless $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next");

    $page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');
}

# Close the open elements and finish the document
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Emulate the page's doPostBack javascript function.
#
# Arguments:
#   $previous - parsed tree of the page the post-back originates from
#   $target   - name of the control raising the event; every "$" in it is
#               replaced by ":" before it is sent
#   $argument - event argument to send along
#
# Returns whatever do_post() returns for the resulting request.
sub do_post_back
{
    my ($previous, $target, $argument) = @_;

    # Translate the control name on a private copy
    (my $event_target = $target) =~ s/\$/:/g;

    return do_post(
        $previous,
        {
            "__EVENTTARGET" => $event_target,
            "__EVENTARGUMENT" => $argument,
        }
    );
}

# Make a POST request to $SearchURL and parse the response.
#
# Arguments:
#   $previous - optional parsed tree of the previously fetched page. When
#               given, the values of its hidden "__VIEWSTATE" and
#               "__EVENTVALIDATION" inputs are copied into the request.
#   $args     - optional hash reference of form fields to send. The hash
#               is not modified.
#
# Returns an HTML::TreeBuilder tree built from the response content.
# Dies with the response status line when the request is unsuccessful.
sub do_post
{
    my $previous = shift;
    my $args = shift || {};

    # Work on a copy so the hidden fields added below never leak into the
    # caller's hash.
    my %form = %{$args};

    if (defined($previous))
    {
        foreach my $name ("__VIEWSTATE", "__EVENTVALIDATION")
        {
            my $field = $previous->look_down("_tag" => "input", "name" => $name);

            # A page without this hidden input has nothing to carry over;
            # calling attr() on the undefined lookup result would die.
            next unless defined($field);

            $form{$name} = $field->attr("value");
        }
    }

    my $response = $UA->post($SearchURL, \%form);

    die $response->status_line unless $response->is_success;

    return HTML::TreeBuilder->new_from_content($response->content);
}

# Output the applications listed on a results page.
#
# Arguments:
#   $page - parsed tree of a results page
#
# Writes one <application> element to $Writer for every data row of the
# table with class "searchresults". A page without that table (no results)
# produces no output. Returns nothing.
sub output_applications
{
    my $page = shift;

    # Find the result table; no results means no results table
    my $table = $page->look_down("_tag" => "table", "class" => "searchresults");
    return unless defined($table);

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr"))
    {
        my $class = $row->attr("class") || "";

        next if $class eq "searchresultsheader";

        my @cells = $row->look_down("_tag" => "td");

        # The four columns read below must all exist; a shorter row would
        # otherwise abort the run with a method call on an undefined value.
        next if @cells < 4;

        my ($reference, $date, $address, $description) =
            map { $_->as_trimmed_text } @cells[0 .. 3];

        # The postcode is taken from the end of the address when one is
        # recognised there; otherwise it stays empty rather than undefined,
        # so the element is written without an "uninitialized" warning.
        my $postcode = "";

        if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
        {
            $postcode = $1;
        }

        $Writer->startTag("application");
        $Writer->dataElement("council_reference", $reference);
        $Writer->dataElement("address", $address);
        $Writer->dataElement("postcode", $postcode);
        $Writer->dataElement("description", $description);
        # The same details URL serves as both info and comment link
        $Writer->dataElement("info_url", $InfoURL . $reference);
        $Writer->dataElement("comment_url", $InfoURL . $reference);
        $Writer->dataElement("date_received", $date);
        $Writer->endTag("application");
    }

    return;
}

+ 88
- 0
python_scrapers/WestDorset.py 파일 보기

@@ -0,0 +1,88 @@
import urllib2
import urllib
import datetime
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class WestDorsetParser:
    """Scraper for the West Dorset District Council planning search.

    Results are accumulated in a single PlanningAuthorityResults object
    created by the constructor, so repeated searches on one instance add
    to the same collection.
    """

    def __init__(self, *args):
        """Set up the authority names, site URLs and the results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "West Dorset District Council"
        self.authority_short_name = "West Dorset"

        # Search form, and details page template (%s is the application
        # reference).
        self.base_url = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx"
        self.info_url = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Application=%s&Authority=West+Dorset+District+Council+"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _applicationFromRow(self, row, received):
        """Build a PlanningApplication from one row of the results table.

        row is a <tr> element of the results table; received is the date
        that was searched for.
        """
        entry = PlanningApplication()

        # The date received is known up front: it is the date searched for.
        entry.date_received = received

        cells = row.findAll("td")

        entry.council_reference = cells[0].font.string.strip()
        entry.address = cells[2].font.string.strip()
        entry.postcode = getPostcodeFromText(entry.address)
        entry.description = cells[3].font.string.strip()

        # The comment page can't be reached directly without javascript,
        # so the info url doubles as the comment url.
        entry.info_url = self.info_url %(entry.council_reference)
        entry.comment_url = entry.info_url

        return entry

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications raised on the given date.

        day, month and year are integers. Every application found is
        added to the instance's results object, which is returned.
        """
        search_date = datetime.date(year, month, day)

        # Fetch the blank form first; its hidden state fields have to be
        # sent back with the search.
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        viewstate = get_soup.find("input", id="__VIEWSTATE")["value"]
        formatted_date = search_date.strftime(date_format)
        event_validation = get_soup.find("input", id="__EVENTVALIDATION")["value"]

        # Only the "date raised" range is filled in. The form's other
        # fields (quick search by application number or past week,
        # property name/number, street, town/village, postcode, parish,
        # applicant surname, agent name, decision date range) are not sent.
        form_fields = [
            ("__VIEWSTATE", viewstate),
            ("DetailedSearch$TextBox_DateRaisedFrom", formatted_date),
            ("DetailedSearch$TextBox_DateRaisedTo", formatted_date),
            ("DetailedSearch$Button_DetailedSearch", "Search"),
            ("__EVENTVALIDATION", event_validation),
        ]

        # The response to the GET is a redirect, so the form is posted to
        # the url that was finally reached.
        post_response = urllib2.urlopen(get_response.url, urllib.urlencode(form_fields))
        post_soup = BeautifulSoup(post_response.read())

        if post_soup.find(text = re.compile("No matching record")):
            return self._results

        results_table = post_soup.find("table", {"class": "searchresults"})

        # The first row contains headers.
        for row in results_table.findAll("tr")[1:]:
            self._results.addApplication(self._applicationFromRow(row, search_date))

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML()'s output.

        day, month and year may be anything int() accepts.
        """
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year))
        return results.displayXML()

if __name__ == '__main__':
parser = WestDorsetParser()
print parser.getResults(1,10,2008)


불러오는 중...
취소
저장