
Add perl scraper from Peter for Newport.

Add scrapers for Merton and Conwy (PlanningExplorer)
master
duncan.parkes 16 years ago
commit de05835fed
5 changed files with 248 additions and 67 deletions
  1. python_scrapers/Newport.cgi (+75, -0)
  2. python_scrapers/Newport.pl (+66, -0)
  3. python_scrapers/OtherFilesToCopy.csv (+2, -0)
  4. python_scrapers/PlanningExplorer.py (+103, -67)
  5. python_scrapers/SitesToGenerate.csv (+2, -0)

python_scrapers/Newport.cgi (+75, -0)

@@ -0,0 +1,75 @@
#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use File::Temp qw(tempfile);
use LWP::Simple;
use POSIX;
use Encode;
use CGI;
use CGI::Carp;

sub sanity_check {
    my ($var) = @_;
    defined $var or return 0;
    $var =~ /^[0-9]+$/ or return 0;
    return 1;
}

sub no_results {
    my ($y, $m, $d, $reason) = @_;
    print <<NIL;
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>Newport City Council</authority_name>
<authority_short_name>Newport</authority_short_name>
<applications>
</applications>
</planning>
NIL
    die "$y/$m/$d failed: $reason\n";
}

my $cgi = new CGI;

my $year = $cgi->param("year");
my $month = $cgi->param("month");
my $day = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $tree = HTML::TreeBuilder->new;
# $tree->parse_file('weekly-planning-bw-lists.htm');
$tree->parse(decode_utf8(get('http://www.newport.gov.uk/_dc/index.cfm?fuseaction=planapps.applist') or die "couldn't fetch index page"));
$tree->eof;

my $re = sprintf('Lists?\s+for %02d/%02d/%04d', $day, $month, $year);

my ($day_p) = $tree->look_down(
    "_tag", "p",
    sub { $_[0]->as_text =~ /$re/i }
);
$day_p or no_results($year, $month, $day, "Cannot find day paragraph");

my ($day_link) = $day_p->find_by_tag_name("a");
$day_link or no_results($year, $month, $day, "Cannot find day link");

my $day_absurl = $day_link->attr('href');

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
print $fh get($day_absurl);
close($fh);

print "Content-type: text/xml\n\n";
system "./Newport.pl", $filename, $day_absurl and die "system failed: $|";

unlink $filename or die "cannot unlink temporary file $filename: $!";

python_scrapers/Newport.pl (+66, -0)

@@ -0,0 +1,66 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;

my $file = $ARGV[0];
my $info_url = $ARGV[1];

my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "Newport City Council");
$writer->dataElement("authority_short_name", "Newport");
$writer->startTag("applications");

open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    if ($line =~ /^\s*App No:\s*(\S+)/) {
        my $refno = $1;
        my $address = ""; my $proposal = ""; my $date_received;
        my $cur_field;
        my $near_end;
        while (1) {
            chomp $line;
            $line =~ s/^\s+//; $line =~ s/\s+$//;
            if ($line =~ s/^ApplicationSite://) {
                $cur_field = \$address;
            } elsif ($line =~ s/^Proposal://) {
                $cur_field = \$proposal;
            } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^App Type://)) {
                $cur_field = undef;
            } elsif ($line =~ /^Date Registered:\s*(\S+)/) {
                $date_received = $1;
                $cur_field = undef;
            }
            $line =~ s/^\s+//;
            if (defined $cur_field) {
                $$cur_field .= " " if $$cur_field ne "" and $line ne "";
                $$cur_field .= $line;
            }
            last unless defined ($line = <$fh>);
            last if $near_end && length $line == 1;
            $near_end = 1 if $line =~ /^\s*Case Officer:/;
        }
        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]/) {
            $postcode = $1;
        }

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", 'planning@newport.gov.uk');
        $writer->dataElement("date_received", $date_received);
        $writer->endTag;
    }
}

$writer->endTag;
$writer->endTag;
$writer->end;
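
For readers more at home in the rest of the python_scrapers code, here is a rough Python sketch of the same field-accumulation Newport.pl performs on the pdftotext output. It is an illustration only, not code added by this commit: the function name and dictionary keys are invented, pdftotext (with -layout) is assumed to be on the PATH, and the field labels follow the Perl above.

import re
import subprocess

# Postcode pattern, as in Newport.pl.
POSTCODE_RE = re.compile(r"[A-Z][A-Z]?\d(?:\d|[A-Z])? ?\d[A-Z][A-Z]")

def parse_newport_pdf(filename):
    # Run pdftotext -layout on the weekly list, as the Perl script does.
    text = subprocess.check_output(["pdftotext", "-layout", filename, "-"])
    apps, current, field = [], None, None
    for raw in text.decode("utf-8", "replace").splitlines():
        line = raw.strip()
        m = re.match(r"App No:\s*(\S+)", line)
        if m:
            # A new application block starts at each "App No:" line.
            current = {"council_reference": m.group(1), "address": "",
                       "description": "", "date_received": None}
            apps.append(current)
            field = None
            continue
        if current is None:
            continue
        # Labelled lines switch which field the following text accumulates into.
        if line.startswith("ApplicationSite:"):
            field, line = "address", line[len("ApplicationSite:"):].strip()
        elif line.startswith("Proposal:"):
            field, line = "description", line[len("Proposal:"):].strip()
        elif line.startswith(("Applicant:", "Agent:", "App Type:")):
            field = None
        elif line.startswith("Date Registered:"):
            m = re.match(r"Date Registered:\s*(\S+)", line)
            if m:
                current["date_received"] = m.group(1)
            field = None
        if field and line:
            current[field] = (current[field] + " " + line).strip()
    # Pull a postcode out of the accumulated address, defaulting to "None".
    for app in apps:
        m = POSTCODE_RE.search(app["address"])
        app["postcode"] = m.group(0) if m else "None"
    return apps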

python_scrapers/OtherFilesToCopy.csv (+2, -0)

@@ -37,3 +37,5 @@
"Glasgow.cgi", "493"
"Highland.pl", "493"
"Highland.cgi", "493"
"Newport.pl", "493"
"Newport.cgi", "493"

python_scrapers/PlanningExplorer.py (+103, -67)

@@ -60,6 +60,10 @@ class PlanningExplorerParser:
address_td_no = 1
description_td_no = 2

# In some cases we won't be able to get the full address/description/postcode without getting the info page for each app.
# If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
fetch_info_page = False

def _modify_response(self, response):
"""For most sites, we have managed to get all the apps on a
single page by choosing the right parameters.
@@ -129,7 +133,20 @@ class PlanningExplorerParser:
return post_data

def _getPostCode(self):

def _getAddress(self, tds, info_soup):
# If this td contains a div, then the address is the
# string in there - otherwise, use the string in the td.
address_td = tds[self.address_td_no]
if address_td.div is not None:
address = address_td.div.string
else:
address = address_td.string
return address


def _getPostCode(self, info_soup):
"""In most cases, the postcode can be got from the address in
the results table. Some councils put the address there without the
postcode. In this case we will have to go to the info page to get
@@ -138,6 +155,21 @@ class PlanningExplorerParser:

return getPostcodeFromText(self._current_application.address)
def _getDescription(self, tds, info_soup):
description_td = tds[self.description_td_no]
if description_td.div is not None:
# Mostly this is in a div
# Use the empty string if the description is missing
description = description_td.div.string or ""
else:
# But sometimes (eg Crewe) it is directly in the td.
# Use the empty string if the description is missing
description = description_td.string or ""

return description


def __init__(self,
authority_name,
authority_short_name,
@@ -214,53 +246,39 @@ class PlanningExplorerParser:

tds = tr.findAll("td")

for td_no in range(len(tds)):
if td_no == self.reference_td_no:
# This td contains the reference number and a link to details
self._current_application.council_reference = tds[td_no].a.string

relative_info_url = self._sanitiseInfoUrl(tds[td_no].a['href'])

self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

self._current_application.council_reference = tds[self.reference_td_no].a.string

# What about a comment url?
# There doesn't seem to be one, so we'll use the email address
if self.comments_email_address is not None:
# We're using the email address, as there doesn't seem
# to be a web form for comments
self._current_application.comment_url = self.comments_email_address
else:
# This link contains a code which we need for the comments url
# (on those sites that use it)
application_code = app_code_regex.search(relative_info_url).groups()[0]
relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

relative_comments_url = self.comments_path %(application_code)
self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
# Fetch the info page if we need it, otherwise set it to None

elif td_no == self.address_td_no:
# If this td contains a div, then the address is the
# string in there - otherwise, use the string in the td.
if tds[td_no].div is not None:
address = tds[td_no].div.string
else:
address = tds[td_no].string
if self.fetch_info_page:
# We need to quote the spaces in the info url
info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
info_soup = BeautifulSoup(urllib2.urlopen(info_request))
else:
info_soup = None

self._current_application.address = address
# What about a comment url?
# There doesn't seem to be one, so we'll use the email address
if self.comments_email_address is not None:
# We're using the email address, as there doesn't seem
# to be a web form for comments
self._current_application.comment_url = self.comments_email_address
else:
# This link contains a code which we need for the comments url
# (on those sites that use it)
application_code = app_code_regex.search(relative_info_url).groups()[0]

self._current_application.postcode = self._getPostCode()
relative_comments_url = self.comments_path %(application_code)
self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)

elif td_no == self.description_td_no:
if tds[td_no].div is not None:
# Mostly this is in a div
# Use the empty string if the description is missing
description = tds[td_no].div.string or ""
else:
# But sometimes (eg Crewe) it is directly in the td.
# Use the empty string if the description is missing
description = tds[td_no].string or ""

self._current_application.description = description
self._current_application.address = self._getAddress(tds, info_soup)
self._current_application.postcode = self._getPostCode(info_soup)
self._current_application.description = self._getDescription(tds, info_soup)

self._results.addApplication(self._current_application)

@@ -470,11 +488,17 @@ class LiverpoolParser(PlanningExplorerParser):

return ''.join(ws_re.split(url))

# FIXME - Merton needs to be done here when it is back up.

class MertonParser(PlanningExplorerParser):
use_firefox_user_agent = True
fetch_info_page = True

def _getAddress(self, tds, info_soup):
return info_soup.find(text="Site Address").findNext("td").string.strip()

def _getDescription(self, tds, info_soup):
return info_soup.find(text="Development Proposal").findNext("td").string.strip()


class ShrewsburyParser(PlanningExplorerParser):
use_firefox_user_agent = True

@@ -574,6 +598,16 @@ class WalthamForestParser(PlanningExplorerParser):
print post_data
return post_data

class ConwyParser(BroadlandLike, PlanningExplorerParser):
search_url_path = "Northgate/planningexplorerenglish/generalsearch.aspx"
info_url_path = "Northgate/PlanningExplorerEnglish/Generic/"
comments_path = "Northgate/PlanningExplorerEnglish/PLComments.aspx?pk=%s"

use_firefox_user_agent = True


#&txtApplicationNumber=&txtProposal=&txtSiteAddress=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=10%2F07%2F2008&dateEnd=20%2F07%2F2008&edrDateSelection=&csbtnSearch=Search


#txtApplicantName=
#txtAgentName=
@@ -597,29 +631,31 @@ if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
#parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
#parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
#parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
#parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
#parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
#parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
#parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
#parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
#parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
#parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
#parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
#parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
#parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
#parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
#parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
#parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
#parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
#parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
#parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
#parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
#parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
print parser.getResults(18, 4, 2008)
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
# parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
# parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
# parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
# parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
# parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
# parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
# parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
# parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
# parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
# parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(3, 7, 2008)

# To Do



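As a rough sketch of how the new hooks are meant to be used (not something this commit adds): a council whose results table lacks a usable address or description sets fetch_info_page and overrides the new _get* methods, as MertonParser does above. The class name, council details and URL below are invented for illustration; the label lookups mirror MertonParser, and the code assumes it sits in PlanningExplorer.py alongside the other parsers.

class ExampleInfoPageParser(PlanningExplorerParser):
    use_firefox_user_agent = True
    # Tell the base class to fetch each application's info page and pass the
    # parsed BeautifulSoup document to _getAddress/_getPostCode/_getDescription.
    fetch_info_page = True

    def _getAddress(self, tds, info_soup):
        # The results table has no usable address, so read it off the info page.
        return info_soup.find(text="Site Address").findNext("td").string.strip()

    def _getDescription(self, tds, info_soup):
        return info_soup.find(text="Development Proposal").findNext("td").string.strip()

#parser = ExampleInfoPageParser("Example Borough Council", "Example", "http://planning.example.gov.uk/")
#print parser.getResults(3, 7, 2008)
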
python_scrapers/SitesToGenerate.csv (+2, -0)

@@ -243,3 +243,5 @@
"London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
"Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser"
"London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
