"""
This is the screenscraper for the planning applications in Amber Valley.
We have to get the initial search page so that we can use the __VIEWSTATE
parameter.
The start and end dates have to be separated by 1 day - I presume they are
interpreting dates as a datetime at midnight...
BeautifulSoup doesn't seem to be able to cope with what comes back from the
post, so we'll use HTMLParser.
The info reference link uses javascript (typical). As far as I can see there is no way to link directly to the info page for an application, so we'll just have to link to the search page.
Bizarrely, the comment url is fine. e.g.
http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=AVA-2008-0955&frm_SiteAddress=147+Derby+Road%0dDuffield%0dBelper%0dDerbyshire%0dDE56+4FQ%0d&frm_Proposal=Rear+single+storey+extension+and+loft+conversion
"""
import urllib2
import urllib
import urlparse
import HTMLParser
import datetime
from BeautifulSoup import BeautifulSoup
from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText
#date_format = "%d/%m/%Y"
class AmberValleyParser(HTMLParser.HTMLParser):
    """Screenscraper for Amber Valley Borough Council planning applications.

    A search is performed by fetching the council's search form (to obtain
    its ``__VIEWSTATE`` value), POSTing the form back with a one-day date
    range, and feeding the response through this HTMLParser subclass.

    In the response, every ``<table class="test">`` is treated as one
    application:

    * the text of the link in the first ``<td>`` is the council reference;
    * the second ``<td>`` holds the address, then a ``<br />``, then the
      description.
    """

    def __init__(self, *args):
        """Initialise parser state and the council-specific URLs.

        ``*args`` is accepted but not used.
        """
        HTMLParser.HTMLParser.__init__(self)

        # Parsing state, maintained by the handle_* callbacks.
        self._in_result_table = False
        self._td_count = None  # index of the current <td> in a result table
        self._get_ref = False  # True while inside the reference link
        self._get_description = False  # True once the <br /> has been seen

        self.authority_name = "Amber Valley Borough Council"
        self.authority_short_name = "Amber Valley"
        self.base_url = "http://www.ambervalley.gov.uk/AVBC/Core/TemplateHandler.aspx?NRMODE=Published&NRNODEGUID=%7bAF862CF0-5C6D-4115-9979-5956B24D12DF%7d&NRORIGINALURL=%2fservices%2fenvironment%2flandandpremises%2fplanningtownandcountry%2fplanningapplications%2fPlanningApplicationRegister%2ehtm&NRCACHEHINT=Guest#filterbottom"
        self.comment_url_template = "http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=%(reference)s&frm_SiteAddress=%(address)s&frm_Proposal=%(description)s"

        self._current_application = None
        self._search_date = None

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def handle_starttag(self, tag, attrs):
        """Track entry into result tables, their cells and the reference link."""
        if tag == "table":
            for key, value in attrs:
                if key == "class" and value == "test":
                    self._current_application = PlanningApplication()

                    # We can set the date_received immediately: the search
                    # only covers a single day.
                    self._current_application.date_received = self._search_date

                    self._in_result_table = True
                    self._td_count = 0
                    break
        elif tag == "td":
            if self._in_result_table:
                self._td_count += 1
                # Each cell starts out before any <br />.
                self._get_description = False
        elif tag == "a" and self._td_count == 1:
            self._get_ref = True

    def handle_endtag(self, tag):
        """Store the application when its table closes; leave the ref link."""
        if tag == "table" and self._in_result_table:
            self._finish_application()

        if tag == "a":
            self._get_ref = False

    def _finish_application(self):
        """Tidy the collected fields, derive the URLs and store the result."""
        application = self._current_application

        # handle_data only fills these in when it sees text, so a table with
        # a missing address or description may leave them unset (possibly
        # None).  Fall back to "" rather than failing the whole scrape.
        description = (application.description or "").strip()
        # split() + join collapses the line breaks inside the address.
        address = ' '.join((application.address or "").split())

        application.description = description
        application.address = address
        application.postcode = getPostcodeFromText(address)
        # Can't link to the info page for an application (the site uses a
        # javascript link), so point at the search page instead.
        application.info_url = self.base_url
        application.comment_url = self._build_comment_url(
            application.council_reference, address, description)

        self._results.addApplication(application)

        self._in_result_table = False
        self._td_count = None

    def _build_comment_url(self, reference, address, description):
        """Return the comment form URL; None or empty values become ''."""
        return self.comment_url_template % {
            "reference": urllib.quote_plus(reference or ""),
            "address": urllib.quote_plus(address or ""),
            "description": urllib.quote_plus(description or ""),
        }

    def handle_startendtag(self, tag, attrs):
        """Note the ``<br />`` separating address from description.

        Only the self-closing form reaches this callback; a plain ``<br>``
        is delivered to handle_starttag, which does not treat it as the
        separator.
        """
        if tag == "br" and self._td_count == 2:
            self._get_description = True

    def handle_data(self, data):
        """Append text to the reference, address or description as appropriate."""
        if self._get_ref:
            self._current_application.council_reference = data
        elif self._td_count == 2:
            # This td contains the address (including postcode)
            # and the description
            application = self._current_application
            if self._get_description:
                # We have passed the <br />, and are looking for the description
                if not application.description:
                    application.description = data
                else:
                    application.description += data
            else:
                # We have not yet passed the <br /> and are looking for the
                # address and postcode.
                if not application.address:
                    application.address = data
                else:
                    application.address += data

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications for the given date.

        ``day``, ``month`` and ``year`` are integers.  Returns the
        PlanningAuthorityResults object held by this parser, with the
        applications found appended to it.  Errors from urllib2 are not
        caught here.
        """
        self._search_date = search_start_date = datetime.date(year, month, day)
        # The site wants the end date to be one day after the start date.
        search_end_date = search_start_date + datetime.timedelta(1)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        try:
            soup = BeautifulSoup(get_response.read())
        finally:
            get_response.close()

        form = soup.find("form", id="__aspnetForm")

        # We're going to need __VIEWSTATE for our post
        viewstate = form.find("input", {"name": "__VIEWSTATE"})['value']
        action = form['action']

        # Now we have what we need to do a POST
        post_url = urlparse.urljoin(self.base_url, action)

        # Every filter field on the form shares this name prefix.  Values in
        # a captured request looked like: lstDayStart=30, lstMonthStart=Jul,
        # lstYearStart=2008, lstDayEnd=8, lstMonthEnd=Aug, lstYearEnd=2008,
        # rblDateType=0, lstOrderBy=RegisterDate+DESC, rblViewType=List,
        # btnQueryPlanApps=Lookup, with the remaining fields empty.
        prefix = "MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:"

        # NOTE: strftime("%b") is locale-dependent; the captured request
        # used English abbreviations such as "Jul".
        filter_fields = [
            ("txbAppNumber", ""),
            ("txbAddressKeyword", ""),
            # Using the day attribute directly to avoid the leading 0
            ("lstDayStart", search_start_date.day),
            ("lstMonthStart", search_start_date.strftime("%b")),
            ("lstYearStart", search_start_date.strftime("%Y")),
            # Using the day attribute directly to avoid the leading 0
            ("lstDayEnd", search_end_date.day),
            ("lstMonthEnd", search_end_date.strftime("%b")),
            ("lstYearEnd", search_end_date.strftime("%Y")),
            ("rblDateType", "0"),
            ("lstDistance", ""),
            ("txbPostcode", ""),
            ("lstWards", ""),
            ("lstParishes", ""),
            ("lstOrderBy", "RegisterDate DESC"),
            ("rblViewType", "List"),
            ("btnQueryPlanApps", "Lookup"),
        ]

        post_data = urllib.urlencode(
            [("__VIEWSTATE", viewstate)]
            + [(prefix + name, value) for name, value in filter_fields]
        )

        post_response = urllib2.urlopen(post_url, post_data)
        try:
            results_page = post_response.read()
        finally:
            post_response.close()

        # BeautifulSoup can't cope with this page, hence the HTMLParser.
        self.feed(results_page)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's results via displayXML(); arguments may be strings."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Manual smoke test: scrape the applications for 4 August 2008 and
    # print the XML.  A parenthesised single argument prints exactly the
    # same as the bare Python 2 print statement.
    print(AmberValleyParser().getResults(4, 8, 2008))