
Add scraper for Westminster.

duncan.parkes committed 16 years ago (commit fa73ab577a)
4 changed files with 117 additions and 3 deletions
  1. trunk/python_scrapers/OtherFilesToCopy.csv (+1, -0)
  2. trunk/python_scrapers/SitesToGenerate.csv (+1, -0)
  3. trunk/python_scrapers/WAM.py (+3, -3)
  4. trunk/python_scrapers/Westminster.py (+112, -0)

trunk/python_scrapers/OtherFilesToCopy.csv (+1, -0)

@@ -51,3 +51,4 @@
 "KingstonUponThames.py", "420"
 "Hounslow.py", "420"
 "Harrow.py", "420"
+"Westminster.py", "420"

trunk/python_scrapers/SitesToGenerate.csv (+1, -0)

@@ -255,3 +255,4 @@
 "Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
 "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
 "London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
+"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"

trunk/python_scrapers/WAM.py (+3, -3)

@@ -167,7 +167,7 @@ class BraintreeParser(WAMParser):
 if __name__ == '__main__':
     #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
     # Camden
-    parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)
+    # parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)
     #Chichester - Done as PublicAccess
     #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
     #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
@@ -178,9 +178,9 @@ if __name__ == '__main__':
     #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
     #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
-    #parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
+    parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)

-    print parser.getResults(20,5,2008)
+    print parser.getResults(1,8,2008)

     # Left to fix

trunk/python_scrapers/Westminster.py (+112, -0)

@@ -0,0 +1,112 @@
"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess underneath all this, but
it only has the apps in for which they are accepting comments, so I think
we may as well use this url and get the lot...

This is the PublicAccess url:
http://publicaccess.westminster.gov.uk/publicaccess/
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"

class WestminsterParser:
def __init__(self, *args):

self.authority_name = "City of Westminster"
self.authority_short_name = "Westminster"
self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# post_data = [
# ("EFNO", ""),
# ("STName", ""),
# ("STNUMB", ""),
# ("ADRSNO", ""),
# ("WARD", "AllWards"),
# ("AGT", ""),
# ("ATCDE", "AllApps"),
# ("DECDE", "AllDecs"),
# ("DTErec", search_day.strftime(date_format)),
# ("DTErecTo", search_day.strftime(date_format)),
# ("DTEvalid", ""),
# ("DTEvalidTo", ""),
# ("APDECDE", "AllAppDecs"),
# ("submit", "Start+Search"),
# ]
post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

while post_data:

# Now get the search page
response = urllib2.urlopen(self.base_url, post_data)
soup = BeautifulSoup(response.read())

results_form = soup.find("form", {"name": "currentsearchresultsNext"})

# Sort out the post_data for the next page, if there is one
# If there is no next page then there will be no inputs in the form.
# In this case, post_data will be '', which is false.

post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

# Each result has one link, and they are the only links in the form

links = results_form.findAll("a")

for link in links:
application = PlanningApplication()

application.date_received = search_day
application.info_url = urlparse.urljoin(self.base_url, link['href'])
application.council_reference = link.string.strip()

application.address = link.findNext("td").string.strip()
application.postcode = getPostcodeFromText(application.address)

application.description = link.findNext("tr").findAll("td")[-1].string.strip()

# To get the comment url, we're going to have to go to each info url :-(

info_response = urllib2.urlopen(application.info_url)
info_soup = BeautifulSoup(info_response)

comment_nav_string = info_soup.find(text="Comment on this case")
if comment_nav_string:
application.comment_url = comment_nav_string.parent['href']
else:
application.comment_url = "No Comments"

#http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

self._results.addApplication(application)


return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = WestminsterParser()
print parser.getResults(1,8,2008)
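A note on the paging loop above: urllib.urlencode over the hidden inputs of the "currentsearchresultsNext" form produces the POST body for the next page, and on the last page the form has no inputs, so the loop condition goes false. A minimal standalone sketch of that idiom (the input names here are made up, not taken from the Westminster site):

    import urllib

    # Mid-results: the next-page form still carries hidden inputs.
    inputs = [("currentpage", "2"), ("pagesize", "20")]
    print urllib.urlencode(inputs)    # "currentpage=2&pagesize=20" -- truthy, loop continues

    # Last page: no inputs left; urlencode of an empty list is "".
    print repr(urllib.urlencode([]))  # '' -- falsy, so `while post_data` stops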

