浏览代码

Add scraper for Amber Valley. This was a really irritating one. There is no way to link to individual info pages, so I have had to give the search URL in all the info URLs. The comment URL works fine for each application.
master
duncan.parkes 16 年前
父节点
当前提交
5df7fe27c8
共有 4 个文件被更改,包括 196 次插入1 次删除
  1. +194
    -0
      python_scrapers/AmberValley.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv
  4. +0
    -1
      python_scrapers/WAM.py

+ 194
- 0
python_scrapers/AmberValley.py 查看文件

@@ -0,0 +1,194 @@
"""
This is the screenscraper for the planning applications in Amber Valley.

We have to get the initial search page so that we can use the __VIEWSTATE
parameter.

The start and end dates have to be separated by 1 day - I presume they are
interpreting dates as a datetime at midnight...

BeautifulSoup doesn't seem to be able to cope with what comes back from the
post, so we'll use HTMLParser.

The info reference link uses javascript (typical). As far as I can see there is no way to link directly to the info page for an application, so we'll just have to link to the search page.

Bizarrely, the comment url is fine. e.g.

http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=AVA-2008-0955&frm_SiteAddress=147+Derby+Road%0dDuffield%0dBelper%0dDerbyshire%0dDE56+4FQ%0d&frm_Proposal=Rear+single+storey+extension+and+loft+conversion

"""

import urllib2
import urllib
import urlparse

import HTMLParser

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

#date_format = "%d/%m/%Y"

class AmberValleyParser(HTMLParser.HTMLParser):
    """Screenscraper for Amber Valley Borough Council planning applications.

    The search form is fetched first so that its ``__VIEWSTATE`` value and
    ``action`` can be reused in a POST for a single day.  The response to
    that POST is then fed through this HTMLParser.  Each
    ``<table class="test">`` in the response is treated as one application:
    the text of the link in the first ``<td>`` is the council reference, and
    the second ``<td>`` holds the address, a ``<br />``, and then the
    description.
    """

    # The month fields are posted as English abbreviations (the example post
    # data captured from the site uses "Jul" and "Aug").  strftime("%b") is
    # locale-dependent, so the values are spelled out here instead.
    _MONTH_ABBREVIATIONS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                            "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    # Every search-filter field name in the form starts with this prefix.
    _FIELD_PREFIX = "MainControl:CustomFunctionality_ZoneMain:EmbeddedUserControlPlaceholderControl1:_ctl0:myFilter:"

    def __init__(self, *args):
        """Set up parser state, site URLs and the results container.

        ``*args`` is accepted and ignored.
        """
        HTMLParser.HTMLParser.__init__(self)

        # Parsing state.
        self._in_result_table = False
        self._td_count = None  # index of the current <td> within a result table
        self._get_ref = False  # True while inside the reference link
        self._get_description = False  # True once the <br /> in the second cell is passed

        self.authority_name = "Amber Valley Borough Council"
        self.authority_short_name = "Amber Valley"
        self.base_url = "http://www.ambervalley.gov.uk/AVBC/Core/TemplateHandler.aspx?NRMODE=Published&NRNODEGUID=%7bAF862CF0-5C6D-4115-9979-5956B24D12DF%7d&NRORIGINALURL=%2fservices%2fenvironment%2flandandpremises%2fplanningtownandcountry%2fplanningapplications%2fPlanningApplicationRegister%2ehtm&NRCACHEHINT=Guest#filterbottom"
        self.comment_url_template = "http://www.ambervalley.gov.uk/services/environment/landandpremises/planningtownandcountry/planningapplications/planappcommentform.htm?frm_AppNum=%(reference)s&frm_SiteAddress=%(address)s&frm_Proposal=%(description)s"

        self._current_application = None
        self._search_date = None

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def handle_starttag(self, tag, attrs):
        """Track entry into result tables, their cells and the reference link."""
        if tag == "table":
            # attrs is a list of (name, value) pairs.
            if ("class", "test") in attrs:
                self._current_application = PlanningApplication()
                # We can set the date_received immediately
                self._current_application.date_received = self._search_date

                self._in_result_table = True
                self._td_count = 0

        elif tag == "td":
            if self._in_result_table:
                self._td_count += 1
                # A new cell starts before any <br />.
                self._get_description = False

        elif tag == "a" and self._td_count == 1:
            self._get_ref = True

    def handle_endtag(self, tag):
        """Finish the current application when its result table closes."""
        if tag == "table" and self._in_result_table:
            application = self._current_application

            # The address or description may never have been filled in (for
            # example a cell without a <br />), so treat a missing value as
            # empty rather than failing the whole scrape on one row.
            # NOTE(review): presumably PlanningApplication leaves these as
            # None by default - confirm in PlanningUtils.
            application.description = (application.description or "").strip()
            # Collapse all runs of whitespace in the address to single spaces.
            application.address = ' '.join((application.address or "").split())
            application.postcode = getPostcodeFromText(application.address)

            # Can't link to the info page, due to javascript idiocy.
            application.info_url = self.base_url
            application.comment_url = self.comment_url_template % {
                "reference": urllib.quote_plus(application.council_reference),
                "address": urllib.quote_plus(application.address),
                "description": urllib.quote_plus(application.description),
            }
            self._results.addApplication(application)

            self._in_result_table = False
            self._td_count = None

        if tag == "a":
            self._get_ref = False

    def handle_startendtag(self, tag, attrs):
        """Treat a ``<br />`` in the second cell as the address/description divider."""
        if tag == "br" and self._td_count == 2:
            self._get_description = True

    def handle_data(self, data):
        """Collect the reference, address and description text."""
        if self._get_ref:
            self._current_application.council_reference = data

        elif self._td_count == 2:
            # This td contains the address (including postcode)
            # and the description
            application = self._current_application

            if self._get_description:
                # We have passed the <br />, and are looking for the description
                application.description = (application.description or "") + data
            else:
                # We have not yet passed the <br /> and are looking for the
                # address and postcode.
                application.address = (application.address or "") + data

    def _build_post_data(self, viewstate, search_start_date, search_end_date):
        """Return the urlencoded POST body for a search between two dates."""
        month_names = self._MONTH_ABBREVIATIONS

        # Field names are given here without their common prefix.  The
        # example post data captured from the site looked like
        # (prefix elided, values urlencoded):
        #   txbAppNumber=  txbAddressKeyword=
        #   lstDayStart=30  lstMonthStart=Jul  lstYearStart=2008
        #   lstDayEnd=8  lstMonthEnd=Aug  lstYearEnd=2008
        #   rblDateType=0  lstDistance=  txbPostcode=  lstWards=
        #   lstParishes=  lstOrderBy=RegisterDate+DESC  rblViewType=List
        #   btnQueryPlanApps=Lookup
        filter_fields = [
            ("txbAppNumber", ""),
            ("txbAddressKeyword", ""),
            # Using the day attribute directly to avoid the leading 0
            ("lstDayStart", search_start_date.day),
            ("lstMonthStart", month_names[search_start_date.month - 1]),
            ("lstYearStart", search_start_date.strftime("%Y")),
            # Using the day attribute directly to avoid the leading 0
            ("lstDayEnd", search_end_date.day),
            ("lstMonthEnd", month_names[search_end_date.month - 1]),
            ("lstYearEnd", search_end_date.strftime("%Y")),
            ("rblDateType", "0"),
            ("lstDistance", ""),
            ("txbPostcode", ""),
            ("lstWards", ""),
            ("lstParishes", ""),
            ("lstOrderBy", "RegisterDate DESC"),
            ("rblViewType", "List"),
            ("btnQueryPlanApps", "Lookup"),
        ]

        return urllib.urlencode(
            [("__VIEWSTATE", viewstate)]
            + [(self._FIELD_PREFIX + name, value) for name, value in filter_fields]
        )

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications for one day and return the results object.

        ``day``, ``month`` and ``year`` are integers.  Applications are added
        to the results object created in ``__init__``, which is also the
        return value.
        """
        self._search_date = search_start_date = datetime.date(year, month, day)
        # The start and end dates have to be separated by one day.
        search_end_date = search_start_date + datetime.timedelta(1)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        try:
            soup = BeautifulSoup(get_response.read())
        finally:
            get_response.close()

        form = soup.find("form", id="__aspnetForm")

        # We're going to need __VIEWSTATE for our post
        viewstate = form.find("input", {"name": "__VIEWSTATE"})['value']
        action = form['action']

        # Now we have what we need to do a POST
        post_url = urlparse.urljoin(self.base_url, action)
        post_data = self._build_post_data(viewstate, search_start_date, search_end_date)

        post_response = urllib2.urlopen(post_url, post_data)
        try:
            # BeautifulSoup is not used on the POST response; it is parsed
            # by this HTMLParser instead.
            self.feed(post_response.read())
        finally:
            post_response.close()

        return self._results

    def getResults(self, day, month, year):
        """Return the day's results as produced by ``displayXML()``.

        The arguments are converted with ``int()``, so numeric strings are
        accepted.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual run: scrape 4 August 2008 and print what getResults returns.
    print(AmberValleyParser().getResults(4, 8, 2008))


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv 查看文件

@@ -42,3 +42,4 @@
"NorthAyrshire.cgi", "493"
"Redbridge.pl", "493"
"Redbridge.cgi", "493"
"AmberValley.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv 查看文件

@@ -246,3 +246,4 @@
"Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser"
"London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
"London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"

+ 0
- 1
python_scrapers/WAM.py 查看文件

@@ -165,7 +165,6 @@ class BraintreeParser(WAMParser):

if __name__ == '__main__':
#parser = WAMParser("Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
# Camden
parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)


正在加载...
取消
保存