@@ -0,0 +1,93 @@
import urllib2
import urlparse
import datetime, time
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
                          PlanningAuthorityResults, \
                          getPostcodeFromText

date_format = "%d/%m/%Y"
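# Used both for building the search URL and for parsing "Date Valid" from the
# info pages; UK-style day/month/year, e.g. "22/05/2008".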

class FlintshireParser:
    def __init__(self, *args):
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this; it seems to be happy
        # without them, and no longer needs to paginate.
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"
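
        # The percent-encoded Query above decodes to a date filter of the form
        # ([frmDteAppldate] >= <start> AND [frmDteAppldate] <= <end>).
        # A rough, hypothetical sketch of how the same URL could be rebuilt
        # with stdlib escaping (not what the scraper actually does; start and
        # end would be already-formatted date strings):
        #
        #   import urllib
        #   query = "([frmDteAppldate] >= %s AND [frmDteAppldate] <= %s)" % (start, end)
        #   url = ("http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query="
        #          + urllib.quote(query, safe="()=/"))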

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # We set the start date one day earlier in order to catch the first
        # result of each day at some point - see the TODO list.
        response = urllib2.urlopen(self.base_url % {"end_date": search_date.strftime(date_format),
                                                    "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each app is stored in its own table.
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see the TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear why this one isn't simply the string of the next
            # sibling, but this works.
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Fetch the info page to get the OSGB grid reference and the date received.
            info_request = urllib2.Request(application.info_url)

            # We need to add the Accept-Language header in order to get UK-style dates.
            info_request.add_header("Accept-Language", "en-gb,en")
            info_response = urllib2.urlopen(info_request)
            info_soup = BeautifulSoup(info_response.read())
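
            # The "Grid Reference" cell appears to hold the easting and
            # northing in consecutive <font> tags, hence the sibling-hopping
            # below to pick out the two values.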
            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font

            application.osgb_x = x_element.string.strip()
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
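
            # time.strptime plus datetime.datetime(*...) is the pre-Python 2.5
            # idiom for parsing a date string (datetime.datetime.strptime only
            # arrived in 2.5).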
            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, though I can't click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = FlintshireParser()
    print parser.getResults(22, 5, 2008)

# TODO

# 1) Email the council about the broken first result. This is always
#    slightly broken (two </td>s for one of the <td>s), which upsets
#    BeautifulSoup.
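
# A possible (untested) workaround sketch for 1): collapse doubled closing
# tags before parsing, so the first table would survive BeautifulSoup:
#
#   html = response.read()
#   html = re.sub(r"</td>\s*</td>", "</td>", html)
#   soup = BeautifulSoup(html)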