소스 검색

Add scraper for Hastings. Sadly, no decent info urls again. Had to use the search page. The real info url is only

accessible with a referer.
import/raw
duncan.parkes 16 년 전
부모
커밋
2bacbbb25a
3개의 변경된 파일87개의 추가작업 그리고 0개의 파일을 삭제
  1. +85
    -0
      trunk/python_scrapers/Hastings.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 85
- 0
trunk/python_scrapers/Hastings.py 파일 보기

@@ -0,0 +1,85 @@
"""
This is the scraper for Hastings.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HastingsParser:
def __init__(self, *args):

self.authority_name = "Hastings Borough Council"
self.authority_short_name = "Hastings"
# self.base_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"
self.base_url = "http://www.hastings.gov.uk/planning/SearchResults.aspx"

# Due to the idiotic design of the Hastings site, we can't give a proper info url.
# There is a sensible URL, but it only works with a referer.
self.info_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"

self.comment_url_template = "http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=%s&syskey=%s"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

post_data = urllib.urlencode((
("type", "app"),
("time", "0"),
))
# Now get the search page
response = urllib2.urlopen(self.base_url, post_data)
soup = BeautifulSoup(response.read())

caseno_strings = soup.findAll(text="Case No:")

for caseno_string in caseno_strings:
application = PlanningApplication()

application.council_reference = caseno_string.findNext("a").string.strip()
info_url = urlparse.urljoin(self.base_url, caseno_string.findNext("a")['href'])

# See above for why we can't use the proper info url.
application.info_url = self.info_url

# In order to avoid doing a download to find the comment page, we'll
# get the system key from this url

syskey = cgi.parse_qs(urlparse.urlsplit(info_url)[3])['id'][0]

application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Registration Date:").findNext("p").string.strip(), date_format).date()

application.address = caseno_string.findNext(text="Location:").findNext("p").string.strip()
application.postcode = getPostcodeFromText(application.address)

application.description = caseno_string.findNext(text="Proposal:").findNext("p").string.strip()

#http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=HS/FA/08/00631&syskey=95642
application.comment_url = self.comment_url_template %(application.council_reference, syskey)

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HastingsParser()
print parser.getResults(2,9,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv 파일 보기

@@ -54,3 +54,4 @@
"Westminster.py", "420"
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv 파일 보기

@@ -258,3 +258,4 @@
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"

불러오는 중...
취소
저장