Преглед изворни кода

Add Berwick scraper.

master
duncan.parkes пре 17 година
родитељ
комит
fed099cffb
3 измењених фајлова са 84 додато и 0 уклоњено
  1. +82
    -0
      python_scrapers/Berwick.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 82
- 0
python_scrapers/Berwick.py Прегледај датотеку

@@ -0,0 +1,82 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the date substituted into the page URL
# (day, month, two-digit year with no separators, e.g. "220508").
search_date_format = "%d%m%y"
# strptime format used to parse the "Registration Date" value scraped
# from each application table (dd/mm/yy).
reg_date_format = "%d/%m/%y"

class BerwickParser:
    """Scraper for Berwick-upon-Tweed Borough Council planning applications.

    Applications are read from a static HTML page whose URL is keyed by a
    Thursday date.  Every application found is added to a single
    PlanningAuthorityResults object created in __init__, so calling the
    search methods more than once on the same parser accumulates results.
    """

    # There is no per-application comment page, so this address is stored
    # as each application's comment_url.
    comments_email_address = "planning@berwick-upon-tweed.gov.uk"

    def __init__(self, *args):
        """Set up the authority details.  Positional arguments are ignored."""

        self.authority_name = "Berwick-upon-Tweed Borough Council"
        self.authority_short_name = "Berwick"
        # %s is replaced by a date formatted with search_date_format.
        self.base_url = "http://www.berwick-upon-tweed.gov.uk/planning/register/wl/%s.htm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the page covering the given date.

        day, month and year are integers.  The page requested is the one
        for the first Thursday on or after that date.

        Returns the parser's PlanningAuthorityResults object with one
        PlanningApplication added per application found on the page.

        Raises ValueError for an invalid date, and whatever
        urllib2.urlopen raises if the page cannot be fetched.  A page
        whose layout differs from what is expected will raise from the
        parsing code (e.g. AttributeError when a label is missing).
        """
        search_day = datetime.date(year, month, day)

        # weekday() is 0 for Monday, so this steps back to the Monday of
        # the week containing search_day.
        monday_before = search_day - datetime.timedelta(search_day.weekday())

        thursday = monday_before + datetime.timedelta(3)
        if search_day.weekday() > 3:  # i.e. it is Friday, Saturday or Sunday
            # This week's Thursday has already passed, so use next week's.
            thursday = thursday + datetime.timedelta(7)

        this_url = self.base_url % (thursday.strftime(search_date_format))

        # Now get the search page.  Close the response even if reading
        # fails, so the underlying socket is not leaked.
        response = urllib2.urlopen(this_url)
        try:
            page = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(page)

        # Each app is stored in a table of its own. The tables don't have
        # any useful attributes, so we'll find all the NavigableString
        # objects which look like " Application Number:" and then look at
        # the tables they are in.
        nav_strings = soup.findAll(text=" Application Number:")

        for nav_string in nav_strings:
            application = PlanningApplication()

            # Each value is taken from the first <p> following its label.
            application.council_reference = nav_string.findNext("p").string.strip()

            result_table = nav_string.findPrevious("table")

            # Only the first child of the date <p> is used, unlike the
            # other fields which read the element's .string.
            application.date_received = datetime.datetime.strptime(result_table.find(text=" Registration Date: ").findNext("p").contents[0].strip(), reg_date_format)

            application.osgb_x = result_table.find(text=" Easting:").findNext("p").string.strip()
            application.osgb_y = result_table.find(text=" Northing:").findNext("p").string.strip()

            application.description = result_table.find(text=" Proposed Development:").findNext("p").string.strip()
            application.address = result_table.find(text=" Location:").findNext("p").string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # All applications share the one page, so it doubles as the
            # info URL for each of them.
            application.info_url = this_url

            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given date.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check when run as a script: scrape the page covering
    # 21 May 2008 and write the result to stdout.
    berwick = BerwickParser()
    print(berwick.getResults(21, 5, 2008))


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Прегледај датотеку

@@ -46,3 +46,4 @@
"Aberdeenshire.py", "420" "Aberdeenshire.py", "420"
"Brent.py", "420" "Brent.py", "420"
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Прегледај датотеку

@@ -250,3 +250,4 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser" "London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"

Loading…
Откажи
Сачувај