From de05835fedd77853bbfb033135be87816764f490 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 31 Jul 2008 08:46:58 +0000
Subject: [PATCH] Add perl scraper from Peter for Newport. Add scrapers for Merton and Conwy (PlanningExplorer)

---
 python_scrapers/Newport.cgi          |  75 ++++++++++++
 python_scrapers/Newport.pl           |  66 +++++++++++
 python_scrapers/OtherFilesToCopy.csv |   2 +
 python_scrapers/PlanningExplorer.py  | 170 ++++++++++++++++-----------
 python_scrapers/SitesToGenerate.csv  |   2 +
 5 files changed, 248 insertions(+), 67 deletions(-)
 create mode 100644 python_scrapers/Newport.cgi
 create mode 100644 python_scrapers/Newport.pl

diff --git a/python_scrapers/Newport.cgi b/python_scrapers/Newport.cgi
new file mode 100644
index 0000000..bfc1fd9
--- /dev/null
+++ b/python_scrapers/Newport.cgi
@@ -0,0 +1,75 @@
+#!/usr/bin/perl -w
+
+use strict;
+use HTML::TreeBuilder;
+use File::Temp qw(tempfile);
+use LWP::Simple;
+use POSIX;
+use Encode;
+use CGI;
+use CGI::Carp;
+
+# A parameter is sane if it is defined and entirely numeric.
+sub sanity_check {
+    my ($var) = @_;
+    defined $var or return 0;
+    $var =~ /^[0-9]+$/ or return 0;
+    return 1;
+}
+
+# Emit an empty applications document for the day, then die so that the
+# reason ends up in the error log.
+sub no_results {
+    my ($y, $m, $d, $reason) = @_;
+    print <<NIL;
+Content-type: text/xml
+
+<planning>
+   <authority_name>Newport City Council</authority_name>
+   <authority_short_name>Newport</authority_short_name>
+   <applications>
+   </applications>
+</planning>
+NIL
+    die "$y/$m/$d failed: $reason\n";
+}
+
+my $cgi = new CGI;
+
+my $year = $cgi->param("year");
+my $month = $cgi->param("month");
+my $day = $cgi->param("day");
+
+unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
+    print <<NIL;
+Content-type: text/plain
+
+Bad request: the year, month, and day parameters must all be numeric.
+NIL
+    exit;
+}
+
+my $tree = HTML::TreeBuilder->new;
+# $tree->parse_file('weekly-planning-bw-lists.htm');
+$tree->parse(decode_utf8(get('http://www.newport.gov.uk/_dc/index.cfm?fuseaction=planapps.applist') or die "couldn't fetch index page"));
+$tree->eof;
+
+my $re = sprintf('Lists?\s+for %02d/%02d/%04d', $day, $month, $year);
+
+my ($day_p) = $tree->look_down(
+    "_tag", "p",
+    sub { $_[0]->as_text =~ /$re/i }
+);
+$day_p or no_results($year, $month, $day, "Cannot find day paragraph");
+
+my ($day_link) = $day_p->find_by_tag_name("a");
+$day_link or no_results($year, $month, $day, "Cannot find day link");
+
+my $day_absurl = $day_link->attr('href');
+
+# Save the day's PDF to a temporary file for Newport.pl to parse.
+my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
+print $fh get($day_absurl);
+close($fh);
+
+print "Content-type: text/xml\n\n";
+system("./Newport.pl", $filename, $day_absurl) and die "system failed: $?";
+
+unlink $filename or die "cannot unlink temporary file $filename: $!";

diff --git a/python_scrapers/Newport.pl b/python_scrapers/Newport.pl
new file mode 100644
index 0000000..5adf151
--- /dev/null
+++ b/python_scrapers/Newport.pl
@@ -0,0 +1,66 @@
+#!/usr/bin/perl -w
+
+use strict;
+use XML::Writer;
+
+my $file = $ARGV[0];
+my $info_url = $ARGV[1];
+
+my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2);
+
+$writer->xmlDecl("UTF-8");
+
+$writer->startTag("planning");
+$writer->dataElement("authority_name", "Newport City Council");
+$writer->dataElement("authority_short_name", "Newport");
+$writer->startTag("applications");
+
+# Convert the PDF to text, keeping the layout so that the field labels survive.
+open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
+while (my $line = <$fh>) {
+    if ($line =~ /^\s*App No:\s*(\S+)/) {
+        my $refno = $1;
+        my $address = ""; my $proposal = ""; my $date_received;
+        my $cur_field;
+        my $near_end;
+        while (1) {
+            chomp $line;
+            $line =~ s/^\s+//; $line =~ s/\s+$//;
+            if ($line =~ s/^ApplicationSite://) {
+                $cur_field = \$address;
+            } elsif ($line =~ s/^Proposal://) {
+                $cur_field = \$proposal;
+            } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^App Type://)) {
+                $cur_field = undef;
+            } elsif ($line =~ /^Date Registered:\s*(\S+)/) {
+                $date_received = $1;
+                $cur_field = undef;
+            }
+            $line =~ s/^\s+//;
+            if (defined $cur_field) {
+                $$cur_field .= " " if $$cur_field ne "" and $line ne "";
+                $$cur_field .= $line;
+            }
+            last unless defined ($line = <$fh>);
+            # A bare newline shortly after the Case Officer field ends this block.
+            last if $near_end && length $line == 1;
+            $near_end = 1 if $line =~ /^\s*Case Officer:/;
+        }
+        my $postcode = "None";
+        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
+            $postcode = $1;
+        }
+
+        $writer->startTag("application");
+        $writer->dataElement("council_reference", $refno);
+        $writer->dataElement("address", $address);
+        $writer->dataElement("postcode", $postcode);
+        $writer->dataElement("description", $proposal);
+        $writer->dataElement("info_url", $info_url);
+        $writer->dataElement("comment_url", 'planning@newport.gov.uk');
+        $writer->dataElement("date_received", $date_received);
+        $writer->endTag;
+    }
+}
+
+$writer->endTag;
+$writer->endTag;
+$writer->end;

diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 55b8b0a..5233b54 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -37,3 +37,5 @@
 "Glasgow.cgi", "493"
 "Highland.pl", "493"
 "Highland.cgi", "493"
+"Newport.pl", "493"
+"Newport.cgi", "493"
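A quick way to sanity-check the postcode capture used in Newport.pl is to run the same regex from Python; the address below is invented for illustration:

    import re

    # The same pattern Newport.pl uses to pull a postcode out of the address.
    postcode_re = re.compile(r"([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])")

    # Invented address, for illustration only.
    m = postcode_re.search("Civic Centre, Godfrey Road, Newport NP20 4UR")
    if m:
        print m.group(1)   # prints: NP20 4UR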
diff --git a/python_scrapers/PlanningExplorer.py b/python_scrapers/PlanningExplorer.py
index 8e01271..2da3961 100644
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -60,6 +60,10 @@ class PlanningExplorerParser:
     address_td_no = 1
     description_td_no = 2
 
+    # In some cases we won't be able to get the full address/description/postcode without fetching the info page for each app.
+    # If fetch_info_page is set to True, the info page is fetched and passed to the _get* helpers as a BeautifulSoup object.
+    fetch_info_page = False
+
     def _modify_response(self, response):
         """For most sites, we have managed to get all the apps on a
         single page by choosing the right parameters.
@@ -129,7 +133,20 @@
         return post_data
 
-    def _getPostCode(self):
+
+    def _getAddress(self, tds, info_soup):
+        # If this td contains a div, then the address is the
+        # string in there - otherwise, use the string in the td.
+        address_td = tds[self.address_td_no]
+        if address_td.div is not None:
+            address = address_td.div.string
+        else:
+            address = address_td.string
+
+        return address
+
+
+    def _getPostCode(self, info_soup):
         """In most cases, the postcode can be got from the address in
         the results table. Some councils put the address there without the
         postcode. In this case we will have to go to the info page to get
         it."""
@@ -136,6 +153,21 @@
 
         return getPostcodeFromText(self._current_application.address)
 
+    def _getDescription(self, tds, info_soup):
+        description_td = tds[self.description_td_no]
+
+        if description_td.div is not None:
+            # Mostly this is in a div
+            # Use the empty string if the description is missing
+            description = description_td.div.string or ""
+        else:
+            # But sometimes (eg Crewe) it is directly in the td.
+            # Use the empty string if the description is missing
+            description = description_td.string or ""
+
+        return description
+
+
     def __init__(self,
                  authority_name,
                  authority_short_name,
@@ -214,53 +246,39 @@
 
             tds = tr.findAll("td")
 
-            for td_no in range(len(tds)):
-                if td_no == self.reference_td_no:
-                    # This td contains the reference number and a link to details
-                    self._current_application.council_reference = tds[td_no].a.string
-
-                    relative_info_url = self._sanitiseInfoUrl(tds[td_no].a['href'])
-
-                    self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
-
+            self._current_application.council_reference = tds[self.reference_td_no].a.string
 
-                    # What about a comment url?
-                    # There doesn't seem to be one, so we'll use the email address
-                    if self.comments_email_address is not None:
-                        # We're using the email address, as there doesn't seem
-                        # to be a web form for comments
-                        self._current_application.comment_url = self.comments_email_address
-                    else:
-                        # This link contains a code which we need for the comments url
-                        # (on those sites that use it)
-                        application_code = app_code_regex.search(relative_info_url).groups()[0]
+            relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
+            self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
 
-                        relative_comments_url = self.comments_path %(application_code)
-                        self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
+            # Fetch the info page if we need it, otherwise set it to None
 
-                elif td_no == self.address_td_no:
-                    # If this td contains a div, then the address is the
-                    # string in there - otherwise, use the string in the td.
-                    if tds[td_no].div is not None:
-                        address = tds[td_no].div.string
-                    else:
-                        address = tds[td_no].string
+            if self.fetch_info_page:
+                # We need to quote the spaces in the info url
+                info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
+
+                info_soup = BeautifulSoup(urllib2.urlopen(info_request))
+            else:
+                info_soup = None
 
-                    self._current_application.address = address
+            # What about a comment url?
+            # There doesn't seem to be one, so we'll use the email address
+            if self.comments_email_address is not None:
+                # We're using the email address, as there doesn't seem
+                # to be a web form for comments
+                self._current_application.comment_url = self.comments_email_address
+            else:
+                # This link contains a code which we need for the comments url
+                # (on those sites that use it)
+                application_code = app_code_regex.search(relative_info_url).groups()[0]
 
-                    self._current_application.postcode = self._getPostCode()
+                relative_comments_url = self.comments_path %(application_code)
+                self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
 
-                elif td_no == self.description_td_no:
-                    if tds[td_no].div is not None:
-                        # Mostly this is in a div
-                        # Use the empty string if the description is missing
-                        description = tds[td_no].div.string or ""
-                    else:
-                        # But sometimes (eg Crewe) it is directly in the td.
-                        # Use the empty string if the description is missing
-                        description = tds[td_no].string or ""
-                    self._current_application.description = description
+            self._current_application.address = self._getAddress(tds, info_soup)
+            self._current_application.postcode = self._getPostCode(info_soup)
+            self._current_application.description = self._getDescription(tds, info_soup)
 
             self._results.addApplication(self._current_application)
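Both the new _getAddress and the default _getDescription above hinge on the same td/div fallback. A minimal standalone sketch of that BeautifulSoup behaviour, with markup invented for the example:

    from BeautifulSoup import BeautifulSoup

    # Some sites wrap the value in a div inside the td; others put it
    # straight into the td, in which case td.div is None.
    for markup in ["<td><div>1 High Street</div></td>", "<td>1 High Street</td>"]:
        td = BeautifulSoup(markup).td
        if td.div is not None:
            print td.div.string
        else:
            print td.string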
@@ -470,11 +488,17 @@ class LiverpoolParser(PlanningExplorerParser):
 
         return ''.join(ws_re.split(url))
 
-# FIXME - Merton needs to be done here when it is back up.
-
 class MertonParser(PlanningExplorerParser):
     use_firefox_user_agent = True
-
+    fetch_info_page = True
+
+    def _getAddress(self, tds, info_soup):
+        return info_soup.find(text="Site Address").findNext("td").string.strip()
+
+    def _getDescription(self, tds, info_soup):
+        return info_soup.find(text="Development Proposal").findNext("td").string.strip()
+
+
 
 class ShrewsburyParser(PlanningExplorerParser):
     use_firefox_user_agent = True
@@ -574,6 +598,16 @@ class WalthamForestParser(PlanningExplorerParser):
         print post_data
         return post_data
 
+class ConwyParser(BroadlandLike, PlanningExplorerParser):
+    search_url_path = "Northgate/planningexplorerenglish/generalsearch.aspx"
+    info_url_path = "Northgate/PlanningExplorerEnglish/Generic/"
+    comments_path = "Northgate/PlanningExplorerEnglish/PLComments.aspx?pk=%s"
+
+    use_firefox_user_agent = True
+
+
+#&txtApplicationNumber=&txtProposal=&txtSiteAddress=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=10%2F07%2F2008&dateEnd=20%2F07%2F2008&edrDateSelection=&csbtnSearch=Search
+
 
 #txtApplicantName=
 #txtAgentName=
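MertonParser above is the first user of the new fetch_info_page hook. For any other council whose results table lacks these details, the pattern would look roughly like this; the parser name and the "Location" label are invented for the sketch:

    from PlanningExplorer import PlanningExplorerParser

    class SomewhereParser(PlanningExplorerParser):
        # Ask the base class to fetch and soup each application's info page.
        fetch_info_page = True

        def _getAddress(self, tds, info_soup):
            # Hypothetical label - read the address off the info page
            # instead of the results table.
            return info_soup.find(text="Location").findNext("td").string.strip()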
@@ -597,29 +631,31 @@ if __name__ == '__main__':
 
     # NOTE - 04/11/2007 is a sunday
     # I'm using it to test that the scrapers behave on days with no apps.
 
-    #parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
-    #parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
-    #parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
-    #parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
-    #parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
-    #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
-    #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
-    #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
-    #parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
-    #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
-    #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
-    #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
-    #parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
-    #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
-    #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
-    #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
-    parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
-    #parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
-    #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
-    #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
-    #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
-    #parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
-    print parser.getResults(18, 4, 2008)
+    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
+#    parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
+#    parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
+#    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
+#    parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
+#    parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
+#    parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
+#    parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
+#    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+#    parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
+#    parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
+#    parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
+#    parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
+#    parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
+#    parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
+#    parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
+#    parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
+#    parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
+#    parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
+#    parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
+#    parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
+#    parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
+#    parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
+#    parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
+    print parser.getResults(3, 7, 2008)
 
 # To Do
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 2146514..d29e87c 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -243,3 +243,5 @@
 "London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
+"Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser"
+"London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
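For reference, exercising one of the new parsers by hand mirrors the __main__ block in PlanningExplorer.py; a minimal sketch, with an arbitrary date:

    from PlanningExplorer import ConwyParser

    parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
    # Arguments are day, month, year, in the same order as the __main__ block.
    print parser.getResults(3, 7, 2008)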