
Add scraper for Hastings. Sadly, no decent info URLs again; had to use the search page. The real info URL is only accessible with a Referer header.
master
duncan.parkes 16 years ago
parent
commit
0bfe5dace9
3 changed files with 87 additions and 0 deletions
  1. +85 -0 python_scrapers/Hastings.py
  2. +1 -0 python_scrapers/OtherFilesToCopy.csv
  3. +1 -0 python_scrapers/SitesToGenerate.csv
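
As the commit message notes, the per-application page only responds when a Referer header from the search page is supplied, which is why the scraper below falls back to a generic info URL. A minimal sketch (not part of this commit) of how such a page could be requested with urllib2, assuming the Referer simply needs to point back at the search page:

import urllib2

SEARCH_URL = "http://www.hastings.gov.uk/planning/SearchResults.aspx"

def fetch_with_referer(info_url, referer=SEARCH_URL):
    # Attach the Referer header the Hastings site insists on before it will
    # serve the per-application page (an assumption taken from the commit
    # message, not verified against the live site).
    request = urllib2.Request(info_url, headers={"Referer": referer})
    return urllib2.urlopen(request).read()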

+85 -0 python_scrapers/Hastings.py

@@ -0,0 +1,85 @@
"""
This is the scraper for Hastings.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class HastingsParser:
    def __init__(self, *args):

        self.authority_name = "Hastings Borough Council"
        self.authority_short_name = "Hastings"
        # self.base_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"
        self.base_url = "http://www.hastings.gov.uk/planning/SearchResults.aspx"

        # Due to the idiotic design of the Hastings site, we can't give a proper info url.
        # There is a sensible URL, but it only works with a referer.
        self.info_url = "http://www.hastings.gov.uk/planning/view_applications.aspx"

        self.comment_url_template = "http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=%s&syskey=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        post_data = urllib.urlencode((
            ("type", "app"),
            ("time", "0"),
            ))
        # Now get the search page
        response = urllib2.urlopen(self.base_url, post_data)
        soup = BeautifulSoup(response.read())

        caseno_strings = soup.findAll(text="Case No:")

        for caseno_string in caseno_strings:
            application = PlanningApplication()

            application.council_reference = caseno_string.findNext("a").string.strip()
            info_url = urlparse.urljoin(self.base_url, caseno_string.findNext("a")['href'])

            # See above for why we can't use the proper info url.
            application.info_url = self.info_url

            # In order to avoid doing a download to find the comment page, we'll
            # get the system key from this url

            syskey = cgi.parse_qs(urlparse.urlsplit(info_url)[3])['id'][0]

            application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Registration Date:").findNext("p").string.strip(), date_format).date()

            application.address = caseno_string.findNext(text="Location:").findNext("p").string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = caseno_string.findNext(text="Proposal:").findNext("p").string.strip()

            # http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=HS/FA/08/00631&syskey=95642
            application.comment_url = self.comment_url_template % (application.council_reference, syskey)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HastingsParser()
    print parser.getResults(2, 9, 2008)
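
The syskey extraction above is a dense one-liner; here is a standalone sketch of the same idea, using a made-up info URL of the shape the search results appear to link to (the only assumption is that the query string carries the key in an 'id' parameter, as the scraper expects):

import cgi
import urlparse

# Illustrative URL only; the path and syskey value are taken from the
# comments in the scraper, not fetched from the live site.
example_info_url = "http://www.hastings.gov.uk/planning/view_applications.aspx?id=95642"

query = urlparse.urlsplit(example_info_url)[3]   # element 3 is the query string
syskey = cgi.parse_qs(query)['id'][0]            # -> "95642"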


+1 -0 python_scrapers/OtherFilesToCopy.csv

@@ -54,3 +54,4 @@
"Westminster.py", "420"
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"

+1 -0 python_scrapers/SitesToGenerate.csv

@@ -258,3 +258,4 @@
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
