Quellcode durchsuchen

Add scraper for Gosport.

Factor out CookieAddingHTTPRedirectHandler.
import/raw
duncan.parkes vor 16 Jahren
Ursprung
Commit
ddc81f06ea
5 geänderte Dateien mit 124 neuen und 10 gelöschten Zeilen
  1. +100
    -0
      trunk/python_scrapers/Gosport.py
  2. +18
    -0
      trunk/python_scrapers/HTTPHandlers.py
  3. +3
    -10
      trunk/python_scrapers/Ocella.py
  4. +2
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  5. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 100
- 0
trunk/python_scrapers/Gosport.py Datei anzeigen

@@ -0,0 +1,100 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import re

import cookielib

# Module-wide cookie jar: filled from the initial GET of the search page and
# then used to add the Cookie header to the search POST and to any redirect
# that follows it.
cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText


from HTTPHandlers import CookieAddingHTTPRedirectHandler

# Opener whose redirect handler is given the shared cookie_jar, so that
# redirected requests carry the cookies as well.
cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))


search_date_format = "%m/%d/%Y" # That's right, the search date is US style (month first).
info_page_date_format = "%d/%m/%Y" # Dates shown by the site (e.g. in the results table) are UK style (day first).

class GosportParser:
    """Scraper for planning applications published by Gosport Borough Council.

    The site is an ASP.NET search form: the form page is fetched first to
    pick up the session cookie and the hidden form fields, then the search
    is POSTed back with the date to look up.
    """

    def __init__(self, *args):
        """Set up the URLs and the results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Gosport Borough Council"
        self.authority_short_name = "Gosport"

        self.base_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationSearch2.aspx"
        self.info_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationDetails.aspx?ID=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the applications submitted on the given date.

        day, month and year are integers. Every application found is added
        to self._results, which is also the return value. Applications
        accumulate on the instance across calls.

        Raises ValueError if day/month/year do not form a valid date, and
        lets urllib2 errors from the two HTTP requests propagate.
        """
        search_date = datetime.date(year, month, day)

        # Fetch the search form first: we need its cookie and the values of
        # the hidden __VIEWSTATE and pgid inputs to post back.
        get_request = urllib2.Request(self.base_url)
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)
        get_soup = BeautifulSoup(get_response.read())

        # Only the "date submitted" search is used. The form's other fields
        # (tbDevAddress, tbApplicantName, tbAgentName, tbDateDetermined) are
        # deliberately not sent.
        post_data = (
            ("__VIEWSTATE", get_soup.find("input", {"name": "__VIEWSTATE"})["value"]),
            ("pgid", get_soup.find("input", {"name": "pgid"})["value"]),
            ("action", "Search"),
            # Send the date that was asked for (previously a fixed test date
            # was posted here regardless of the arguments).
            ("ApplicationSearch21:tbDateSubmitted", search_date.strftime(search_date_format)),
            ("ApplicationSearch21:btnDateSubmitted", "Search"),
        )

        post_request = urllib2.Request(self.base_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)
        # The opener re-adds the cookie if the POST is answered by a redirect.
        post_response = cookie_handling_opener.open(post_request)

        post_soup = BeautifulSoup(post_response.read())

        results_table = post_soup.find("table", id="SearchResults1_dgSearchResults")
        if results_table is None:
            # No results table on the page: nothing to add for this date.
            return self._results

        # Discard the first <tr>, which contains headers
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()
            tds = tr.findAll("td")

            application.council_reference = tds[0].string.strip()
            application.address = tds[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[2].string.strip()

            # The date column is in UK (day first) format.
            received = time.strptime(tds[3].string.strip(), info_page_date_format)
            application.date_received = datetime.datetime(*(received[0:6]))
            application.info_url = self.info_url % (application.council_reference)

            # The comment url must be accessed by a POST, so we'll just use
            # the info url for that as well
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as rendered by displayXML().

        day, month and year may be ints or numeric strings; they are
        converted with int() before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (1 October 2008) and dump the
    # rendered results to stdout.
    gosport = GosportParser()
    print(gosport.getResults(1, 10, 2008))


+ 18
- 0
trunk/python_scrapers/HTTPHandlers.py Datei anzeigen

@@ -0,0 +1,18 @@

from urllib2 import HTTPRedirectHandler

class CookieAddingHTTPRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that puts the cookies from a cookie jar on redirects.

    The standard python HTTPRedirectHandler doesn't add a cookie to the new
    request it builds after a 302. This handler does, taking the cookie
    from the jar it was constructed with.
    """

    def __init__(self, cookie_jar):
        """Remember the cookie jar to draw cookies from."""
        # This really ought to call the superclass's init method, but there
        # doesn't seem to be one.
        self.cookie_jar = cookie_jar

    def redirect_request(self, *args):
        """Build the redirected request as usual, then add the cookie header.

        All arguments are handed straight to
        HTTPRedirectHandler.redirect_request; its result is returned after
        the cookie jar has been asked to add its header to it.
        """
        redirected = HTTPRedirectHandler.redirect_request(self, *args)
        self.cookie_jar.add_cookie_header(redirected)
        return redirected

+ 3
- 10
trunk/python_scrapers/Ocella.py Datei anzeigen

@@ -18,16 +18,9 @@ search_date_format = "%d-%m-%Y" # Format used for the accepted date when searchi

possible_date_formats = [search_date_format, "%d/%m/%Y"]

class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
"""The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
def redirect_request(self, req, fp, code, msg, headers, newurl):
new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
# We need to add a cookie from the cookie_jar
cookie_jar.add_cookie_header(new_request)
from HTTPHandlers import CookieAddingHTTPRedirectHandler

return new_request

cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())
cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))


class OcellaParser:
@@ -206,7 +199,7 @@ if __name__ == '__main__':
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")



+ 2
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Datei anzeigen

@@ -1,6 +1,7 @@
"filename", "permissions"
"PublicAccess.py", "420"
"PlanningUtils.py", "420"
"HTTPHandlers.py", "420"
"SouthOxfordshireParser.py", "420"
"SouthOxfordshire.cgi", "493"
"ApplicationSearchServletParser.py", "420"
@@ -58,3 +59,4 @@
"Herefordshire.py", "420"
"Exmoor.py", "420"
"Eastbourne.py", "420"
"Gosport.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Datei anzeigen

@@ -264,3 +264,4 @@
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"
"Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser"
"Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"

Laden…
Abbrechen
Speichern