ソースを参照

Add scraper for Hastings. Sadly, there are no decent info URLs again, so we had to use the search page: the real info URL is only accessible with a referer.
import/raw
duncan.parkes 16年前
コミット
2bacbbb25a
3個のファイルの変更87行の追加0行の削除
  1. +85
    -0
      trunk/python_scrapers/Hastings.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 85
- 0
trunk/python_scrapers/Hastings.py ファイルの表示

@@ -0,0 +1,85 @@
"""
This is the scraper for Hastings.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strptime format used to parse the "Registration Date:" values scraped
# from the results page (day/month/four-digit year).
date_format = "%d/%m/%Y"

class HastingsParser:
    """Scraper for the Hastings Borough Council planning search results.

    The parser POSTs to the council's search results page and turns every
    "Case No:" entry found there into a PlanningApplication, which is added
    to a PlanningAuthorityResults object held on the instance.
    """

    def __init__(self, *args):
        """Set up the authority names, the URLs and the results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Hastings Borough Council"
        self.authority_short_name = "Hastings"
        self.base_url = "http://www.hastings.gov.uk/planning/SearchResults.aspx"

        # The site does have a sensible per-application URL, but it only
        # works when requested with a referer, so every application is given
        # this generic page as its info url instead.
        self.info_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"

        self.comment_url_template = "http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=%s&syskey=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _getFieldText(self, caseno_string, label):
        """Return the stripped text of the first <p> following `label`.

        The search for `label` starts at `caseno_string`, so the value found
        belongs to that application's entry on the page.
        """
        return caseno_string.findNext(text=label).findNext("p").string.strip()

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the search results page and record each application on it.

        day, month and year are integers. An impossible date raises
        ValueError (from datetime.date).

        Returns the instance's results object, with one application added
        for every "Case No:" entry on the page.
        """
        # Constructing the date validates the arguments.
        # NOTE(review): the date is neither sent to the site nor used to
        # filter the entries below, so everything the search returns is
        # recorded - confirm that this is intended.
        datetime.date(year, month, day)

        post_data = urllib.urlencode((
            ("type", "app"),
            ("time", "0"),
        ))

        # Supplying a data argument makes urllib2 issue a POST request.
        response = urllib2.urlopen(self.base_url, post_data)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            # Release the connection even if reading or parsing fails.
            response.close()

        for caseno_string in soup.findAll(text="Case No:"):
            application = PlanningApplication()

            # The link following the label carries both the case number
            # (its text) and the system key (in its href).
            case_link = caseno_string.findNext("a")
            application.council_reference = case_link.string.strip()
            detail_url = urlparse.urljoin(self.base_url, case_link['href'])

            # The per-application url needs a referer, so use the generic one.
            application.info_url = self.info_url

            # In order to avoid doing a download to find the comment page,
            # take the system key from the "id" parameter of the link's
            # query string (element 3 of the urlsplit result).
            syskey = cgi.parse_qs(urlparse.urlsplit(detail_url)[3])['id'][0]

            application.date_received = datetime.datetime.strptime(
                self._getFieldText(caseno_string, "Registration Date:"),
                date_format).date()

            application.address = self._getFieldText(caseno_string, "Location:")
            application.postcode = getPostcodeFromText(application.address)

            application.description = self._getFieldText(caseno_string, "Proposal:")

            # e.g. http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=HS/FA/08/00631&syskey=95642
            application.comment_url = self.comment_url_template % (application.council_reference, syskey)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Run the scrape for the given date and return its displayXML() output.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape the live site and write the result to stdout.
    parser = HastingsParser()
    output = parser.getResults(2, 9, 2008)
    print(output)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv ファイルの表示

@@ -54,3 +54,4 @@
"Westminster.py", "420"
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv ファイルの表示

@@ -258,3 +258,4 @@
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"

読み込み中…
キャンセル
保存