From 5df7fe27c8e901f9c10b8b92891cd1da4796f993 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 7 Aug 2008 23:28:06 +0000
Subject: [PATCH] Add scraper for Amber Valley.

This was a really irritating one. There is no way to link to individual info pages, so have had to give the search url in all the info urls. Comment url works fine for each app.
---
 python_scrapers/AmberValley.py       | 223 +++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/SitesToGenerate.csv  |   1 +
 python_scrapers/WAM.py               |   1 -
 4 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 python_scrapers/AmberValley.py

diff --git a/python_scrapers/AmberValley.py b/python_scrapers/AmberValley.py
new file mode 100644
index 0000000..64fa382
--- /dev/null
+++ b/python_scrapers/AmberValley.py
@@ -0,0 +1,223 @@
+"""
+This is the screenscraper for the planning applications in Amber Valley.
+
+We have to get the initial search page so that we can use the __VIEWSTATE
+parameter.
+
+The start and end dates have to be separated by 1 day - I presume they are
+interpreting dates as a datetime at midnight...
+
+BeautifulSoup doesn't seem to be able to cope with what comes back from the
+post, so we'll use HTMLParser.
+
+The info reference link uses javascript (typical). As far as I can see there is no way to link directly to the info page for an application, so we'll just have to link to the search page.
+
+Bizarrely, the comment url is fine. e.g.
+
+http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=AVA-2008-0955&frm_SiteAddress=147+Derby+Road%0dDuffield%0dBelper%0dDerbyshire%0dDE56+4FQ%0d&frm_Proposal=Rear+single+storey+extension+and+loft+conversion
+
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import HTMLParser
+
+import datetime
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+#date_format = "%d/%m/%Y"
+
+class AmberValleyParser(HTMLParser.HTMLParser):
+    """Scraper for Amber Valley's planning application search results.
+
+    Each application is a <table class="test"> in the results page; its
+    first <td> holds the reference inside an <a>, and its second <td> holds
+    the address, then a <br />, then the description.
+    """
+
+    def __init__(self, *args):
+        # *args is accepted but ignored.
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Parsing state, driven by the handle_* callbacks.
+        self._in_result_table = False
+        self._td_count = None
+        self._get_ref = False
+        self._get_description = False
+
+        self.authority_name = "Amber Valley Borough Council"
+        self.authority_short_name = "Amber Valley"
+        self.base_url = "http://www.ambervalley.gov.uk/AVBC/Core/TemplateHandler.aspx?NRMODE=Published&NRNODEGUID=%7bAF862CF0-5C6D-4115-9979-5956B24D12DF%7d&NRORIGINALURL=%2fservices%2fenvironment%2flandandpremises%2fplanningtownandcountry%2fplanningapplications%2fPlanningApplicationRegister%2ehtm&NRCACHEHINT=Guest#filterbottom"
+        self.comment_url_template = "http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=%(reference)s&frm_SiteAddress=%(address)s&frm_Proposal=%(description)s"
+
+        self._current_application = None
+        self._search_date = None
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "table":
+            for key, value in attrs:
+                if key == "class" and value == "test":
+                    # Each table with this class starts a new application.
+                    self._current_application = PlanningApplication()
+
+                    # We can set the date_received immediately
+                    self._current_application.date_received = self._search_date
+
+                    self._in_result_table = True
+                    self._td_count = 0
+
+                    break
+
+        elif tag == "td":
+            if self._in_result_table:
+                self._td_count += 1
+                self._get_description = False
+        elif tag == "a" and self._td_count == 1:
+            # The link in the first cell contains the council reference.
+            self._get_ref = True
+
+    def handle_endtag(self, tag):
+        if tag == "table" and self._in_result_table:
+            application = self._current_application
+
+            # A field that received no text is still falsy (the checks in
+            # handle_data rely on that), so fall back to "" rather than
+            # failing on .strip() when a cell is empty.
+            application.description = (application.description or "").strip()
+            application.address = ' '.join((application.address or "").split())
+            application.postcode = getPostcodeFromText(application.address)
+            application.info_url = self.base_url # Can't link to the info page, due to javascript idiocy.
+            application.comment_url = self.comment_url_template %{"reference": urllib.quote_plus(application.council_reference),
+                                                                  "address": urllib.quote_plus(application.address),
+                                                                  "description": urllib.quote_plus(application.description),
+                                                                  }
+
+            self._results.addApplication(application)
+
+            self._in_result_table = False
+            self._td_count = None
+
+        if tag == "a":
+            self._get_ref = False
+
+    def handle_startendtag(self, tag, attrs):
+        # The <br /> in the second cell separates address from description.
+        if tag == "br" and self._td_count == 2:
+            self._get_description = True
+
+    def handle_data(self, data):
+        if self._get_ref:
+            self._current_application.council_reference = data
+
+        elif self._td_count == 2:
+            # This td contains the address (including postcode)
+            # and the description
+
+            if self._get_description:
+                # We have passed the <br />, and are looking for the description
+                if not self._current_application.description:
+                    self._current_application.description = data
+                else:
+                    self._current_application.description += data
+            else:
+                # We have not yet passed the <br /> and are looking for the address and postcode.
+                if not self._current_application.address:
+                    self._current_application.address = data
+                else:
+                    self._current_application.address += data
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Search for applications received on the given date.
+
+        Fetches the search page for its __VIEWSTATE, posts a one day search,
+        feeds the response through this parser, and returns the accumulated
+        PlanningAuthorityResults.
+        """
+        self._search_date = search_start_date = datetime.date(year, month, day)
+        search_end_date = search_start_date + datetime.timedelta(1)
+
+        # Now get the search page
+        get_response = urllib2.urlopen(self.base_url)
+        try:
+            soup = BeautifulSoup(get_response.read())
+        finally:
+            # Always release the connection, even if the read fails.
+            get_response.close()
+
+        form = soup.find("form", id="__aspnetForm")
+
+        # We're going to need __VIEWSTATE for our post
+        viewstate = form.find("input", {"name":"__VIEWSTATE"})['value']
+        action = form['action']
+
+        # Now we have what we need to do a POST
+
+        post_url = urlparse.urljoin(self.base_url, action)
+
+# Example post data without the __VIEWSTATE
+
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbAppNumber=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbAddressKeyword=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDayStart=30
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstMonthStart=Jul
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstYearStart=2008
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDayEnd=8
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstMonthEnd=Aug
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstYearEnd=2008
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3ArblDateType=0
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDistance=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbPostcode=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstWards=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstParishes=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstOrderBy=RegisterDate+DESC
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3ArblViewType=List
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AbtnQueryPlanApps=Lookup
+
+        # Every field name other than __VIEWSTATE shares this prefix.
+        prefix = "MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:"
+
+        post_data = urllib.urlencode([
+                ("__VIEWSTATE", viewstate),
+                (prefix + "txbAppNumber", ""),
+                (prefix + "txbAddressKeyword", ""),
+                (prefix + "lstDayStart", search_start_date.day), # Using the attribute directly to avoid the leading 0
+                (prefix + "lstMonthStart", search_start_date.strftime("%b")),
+                (prefix + "lstYearStart", search_start_date.strftime("%Y")),
+                (prefix + "lstDayEnd", search_end_date.day), # Using the attribute directly to avoid the leading 0
+                (prefix + "lstMonthEnd", search_end_date.strftime("%b")),
+                (prefix + "lstYearEnd", search_end_date.strftime("%Y")),
+                (prefix + "rblDateType", "0"),
+                (prefix + "lstDistance", ""),
+                (prefix + "txbPostcode", ""),
+                (prefix + "lstWards", ""),
+                (prefix + "lstParishes", ""),
+                (prefix + "lstOrderBy", "RegisterDate DESC"),
+                (prefix + "rblViewType", "List"),
+                (prefix + "btnQueryPlanApps", "Lookup"),
+                ])
+
+        post_response = urllib2.urlopen(post_url, post_data)
+        try:
+            self.feed(post_response.read())
+        finally:
+            post_response.close()
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the given date; each argument is passed through int()."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = AmberValleyParser()
+    print parser.getResults(4,8,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index acd5cb7..c84570e 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -42,3 +42,4 @@
 "NorthAyrshire.cgi", "493"
 "Redbridge.pl", "493"
 "Redbridge.cgi", "493"
+"AmberValley.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 2c5a2aa..b1c4021 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ 
b/python_scrapers/SitesToGenerate.csv @@ -246,3 +246,4 @@ "Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser" "London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser" "London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser" +"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser" diff --git a/python_scrapers/WAM.py b/python_scrapers/WAM.py index b605689..082574e 100644 --- a/python_scrapers/WAM.py +++ b/python_scrapers/WAM.py @@ -165,7 +165,6 @@ class BraintreeParser(WAMParser): if __name__ == '__main__': - #parser = WAMParser("Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", debug=True) #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True) # Camden parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)