Quellcode durchsuchen

Add scraper for Hastings. Sadly, there are no decent info URLs again, so the search page had to be used: the real info URL is only accessible with a Referer header.
master
duncan.parkes vor 16 Jahren
Ursprung
Commit
0bfe5dace9
3 geänderte Dateien mit 87 neuen und 0 gelöschten Zeilen
  1. +85
    -0
      python_scrapers/Hastings.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 85
- 0
python_scrapers/Hastings.py Datei anzeigen

@@ -0,0 +1,85 @@
"""
This is the scraper for Hastings.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HastingsParser:
    """Scraper for planning applications published by Hastings Borough Council.

    The council site only lists applications via a search-results page,
    and the per-application detail URL only responds when a Referer
    header is supplied, so every application is given the generic
    "view applications" page as its info_url instead.
    """

    def __init__(self, *args):
        # Extra positional args are accepted (and ignored) for
        # compatibility with the generic parser construction machinery.
        self.authority_name = "Hastings Borough Council"
        self.authority_short_name = "Hastings"
        self.base_url = "http://www.hastings.gov.uk/planning/SearchResults.aspx"

        # The real per-application detail URL only works with a Referer
        # header, so we cannot hand out a proper info url; point every
        # application at the generic search page instead.
        self.info_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"

        self.comment_url_template = "http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=%s&syskey=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the search results page and scrape each application from it.

        Returns the accumulated PlanningAuthorityResults object.

        NOTE(review): day/month/year are accepted for interface
        compatibility but are never sent to the site -- the POST always
        asks for the default ("type=app", "time=0") result set. Confirm
        whether the search form supports date filtering.
        """
        post_data = urllib.urlencode((
            ("type", "app"),
            ("time", "0"),
        ))

        # POSTing to the search page returns the listing we scrape.
        response = urllib2.urlopen(self.base_url, post_data)
        soup = BeautifulSoup(response.read())

        # Each application in the results is introduced by a "Case No:" label.
        for caseno_string in soup.findAll(text="Case No:"):
            application = PlanningApplication()

            case_link = caseno_string.findNext("a")
            application.council_reference = case_link.string.strip()
            info_url = urlparse.urljoin(self.base_url, case_link['href'])

            # See the class docstring for why we can't use the real info url.
            application.info_url = self.info_url

            # To avoid downloading the detail page just to find the
            # comment page, pull the system key out of the detail URL's
            # query string (urlsplit(...)[3] is the query component).
            syskey = cgi.parse_qs(urlparse.urlsplit(info_url)[3])['id'][0]

            application.date_received = datetime.datetime.strptime(
                caseno_string.findNext(text="Registration Date:").findNext("p").string.strip(),
                date_format).date()

            application.address = caseno_string.findNext(text="Location:").findNext("p").string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = caseno_string.findNext(text="Proposal:").findNext("p").string.strip()

            # e.g. http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=HS/FA/08/00631&syskey=95642
            application.comment_url = self.comment_url_template % (application.council_reference, syskey)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Scrape the given date's applications and return them as XML text."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HastingsParser()
print parser.getResults(2,9,2008)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Datei anzeigen

@@ -54,3 +54,4 @@
"Westminster.py", "420"
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Datei anzeigen

@@ -258,3 +258,4 @@
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"

Laden…
Abbrechen
Speichern