diff --git a/trunk/python_scrapers/AmberValley.py b/trunk/python_scrapers/AmberValley.py
new file mode 100644
index 0000000..64fa382
--- /dev/null
+++ b/trunk/python_scrapers/AmberValley.py
@@ -0,0 +1,235 @@
+"""
+This is the screenscraper for the planning applications in Amber Valley.
+
+We have to get the initial search page so that we can use the __VIEWSTATE
+parameter.
+
+The start and end dates have to be separated by 1 day - I presume they are
+interpreting dates as a datetime at midnight...
+
+BeautifulSoup doesn't seem to be able to cope with what comes back from the
+post, so we'll use HTMLParser.
+
+The info reference link uses javascript (typical). As far as I can see there is no way to link directly to the info page for an application, so we'll just have to link to the search page.
+
+Bizarrely, the comment url is fine. e.g.
+
+http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=AVA-2008-0955&frm_SiteAddress=147+Derby+Road%0dDuffield%0dBelper%0dDerbyshire%0dDE56+4FQ%0d&frm_Proposal=Rear+single+storey+extension+and+loft+conversion
+
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import HTMLParser
+
+import datetime
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+#date_format = "%d/%m/%Y"
+
+class AmberValleyParser(HTMLParser.HTMLParser):
+    """HTMLParser subclass that scrapes the Amber Valley planning register.
+
+    The parser is fed the HTML returned by the search POST.  Every
+    <table class="test"> is treated as one application: the link in its
+    first cell supplies the council reference, and its second cell holds
+    the address followed, after a <br />, by the description.
+    """
+
+    def __init__(self, *args):
+        """Set up the parse state, the site URLs and the results container.
+
+        Positional arguments are accepted but not used.
+        """
+
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Parse state: whether we are inside a result table, which cell of
+        # it we are in, and which field the next text data belongs to.
+        self._in_result_table = False
+        self._td_count = None
+        self._get_ref = False
+        self._get_description = False
+
+        self.authority_name = "Amber Valley Borough Council"
+        self.authority_short_name = "Amber Valley"
+        self.base_url = "http://www.ambervalley.gov.uk/AVBC/Core/TemplateHandler.aspx?NRMODE=Published&NRNODEGUID=%7bAF862CF0-5C6D-4115-9979-5956B24D12DF%7d&NRORIGINALURL=%2fservices%2fenvironment%2flandandpremises%2fplanningtownandcountry%2fplanningapplications%2fPlanningApplicationRegister%2ehtm&NRCACHEHINT=Guest#filterbottom"
+        self.comment_url_template = "http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=%(reference)s&frm_SiteAddress=%(address)s&frm_Proposal=%(description)s"
+
+        self._current_application = None
+        self._search_date = None
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def handle_starttag(self, tag, attrs):
+        """Track entry into a result table, its cells and the reference link."""
+        if tag == "table":
+            for key, value in attrs:
+                if key == "class" and value == "test":
+                    self._current_application = PlanningApplication()
+
+                    # We can set the date_received immediately
+                    self._current_application.date_received = self._search_date
+
+                    self._in_result_table = True
+                    self._td_count = 0
+
+                    break
+
+        elif tag == "td":
+            if self._in_result_table:
+                self._td_count += 1
+                # Each new cell resets the address/description switch.
+                self._get_description = False
+        elif tag == "a" and self._td_count == 1:
+            # A link in the first cell: its text is the council reference.
+            self._get_ref = True
+
+    def handle_endtag(self, tag):
+        """Finish the current application when its table closes.
+
+        The collected address and description are tidied, the postcode is
+        extracted from the address, the info and comment URLs are filled in
+        and the application is added to the results.  A closing </a> stops
+        collection of the council reference.
+        """
+        if tag == "table" and self._in_result_table:
+            # NOTE(review): .strip() fails if no description/address text was
+            # collected and PlanningApplication leaves them as None - confirm.
+            self._current_application.description = self._current_application.description.strip()
+            self._current_application.address = ' '.join(self._current_application.address.strip().split())
+            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
+            self._current_application.info_url = self.base_url # Can't link to the info page, due to javascript idiocy.
+            self._current_application.comment_url = self.comment_url_template %{"reference": urllib.quote_plus(self._current_application.council_reference),
+                                                                                "address": urllib.quote_plus(self._current_application.address),
+                                                                                "description": urllib.quote_plus(self._current_application.description),
+                                                                                }
+
+            self._results.addApplication(self._current_application)
+
+            self._in_result_table = False
+            self._td_count = None
+
+        if tag == "a":
+            self._get_ref = False
+
+    def handle_startendtag(self, tag, attrs):
+        """Switch from address to description at a <br /> in the second cell."""
+        # NOTE(review): HTMLParser only calls this for self-closing tags; a
+        # plain <br> would go to handle_starttag instead - confirm the markup.
+        if tag == "br" and self._td_count == 2:
+            self._get_description = True
+
+    def handle_data(self, data):
+        """Store text as the reference, address or description of the current application."""
+        if self._get_ref == True:
+            self._current_application.council_reference = data
+
+        elif self._td_count == 2:
+            # This td contains the address (including postcode)
+            # and the description
+
+            if self._get_description:
+                # We have passed the <br />, and are looking for the description
+                if not self._current_application.description:
+                    self._current_application.description = data
+                else:
+                    self._current_application.description += data
+            else:
+                # We have not yet passed the <br /> and are looking for the address and postcode.
+                if not self._current_application.address:
+                    self._current_application.address = data
+                else:
+                    self._current_application.address += data
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Fetch and parse the applications for the given date.
+
+        day, month and year must be integers.  The search page is fetched
+        first to obtain the form action and __VIEWSTATE, then the search form
+        is posted for the range from the given date to the following day and
+        the response is fed to this parser.
+
+        Returns the PlanningAuthorityResults object created in __init__,
+        with the applications found added to it.
+        """
+        self._search_date = search_start_date = datetime.date(year, month, day)
+        search_end_date = search_start_date + datetime.timedelta(1)
+
+        # Now get the search page
+        get_response = urllib2.urlopen(self.base_url)
+
+        soup = BeautifulSoup(get_response.read())
+
+        form = soup.find("form", id="__aspnetForm")
+
+        # We're going to need __VIEWSTATE for our post
+        viewstate = form.find("input", {"name":"__VIEWSTATE"})['value']
+        action = form['action']
+
+        # Now we have what we need to do a POST
+
+        post_url = urlparse.urljoin(self.base_url, action)
+
+# Example post data without the __VIEWSTATE
+
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbAppNumber=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbAddressKeyword=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDayStart=30
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstMonthStart=Jul
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstYearStart=2008
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDayEnd=8
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstMonthEnd=Aug
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstYearEnd=2008
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3ArblDateType=0
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstDistance=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AtxbPostcode=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstWards=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstParishes=
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AlstOrderBy=RegisterDate+DESC
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3ArblViewType=List
+# MainControl%3ACustomFunctionality_ZoneMain%3AEmbeddedUserControlPlaceholderControl1%3A_ctl0%3AmyFilter%3AbtnQueryPlanApps=Lookup
+
+        post_data = urllib.urlencode([
+            ("__VIEWSTATE", viewstate),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:txbAppNumber", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:txbAddressKeyword", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstDayStart", search_start_date.day), # Using the attribute directly to avoid the leading 0
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstMonthStart", search_start_date.strftime("%b")),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstYearStart", search_start_date.strftime("%Y")),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstDayEnd", search_end_date.day), # Using the attribute directly to avoid the leading 0
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstMonthEnd", search_end_date.strftime("%b")),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstYearEnd", search_end_date.strftime("%Y")),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:rblDateType", "0"),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstDistance", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:txbPostcode", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstWards", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstParishes", ""),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:lstOrderBy", "RegisterDate DESC"),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:rblViewType", "List"),
+            ("MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:btnQueryPlanApps", "Lookup"),
+            ])
+
+        post_response = urllib2.urlopen(post_url, post_data)
+
+        self.feed(post_response.read())
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Convert the arguments with int() and return displayXML() of that date's results."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = AmberValleyParser()
+    print parser.getResults(4,8,2008)
+
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index acd5cb7..c84570e 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -42,3 +42,4 @@
 "NorthAyrshire.cgi", "493"
 "Redbridge.pl", "493"
 "Redbridge.cgi", "493"
+"AmberValley.py", "420"
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index 2c5a2aa..b1c4021 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -246,3 +246,4 @@
 "Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser"
 "London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
 "London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
+"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
diff --git a/trunk/python_scrapers/WAM.py b/trunk/python_scrapers/WAM.py
index b605689..082574e 100644
--- a/trunk/python_scrapers/WAM.py
+++ b/trunk/python_scrapers/WAM.py
@@ -165,7 +165,6 @@ class BraintreeParser(WAMParser):
 
 
 if __name__ == '__main__':
-    #parser = WAMParser("Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
     # Camden
     parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)