Browse Source

Add scraper for Mendip.

Make display method on a planningapplication work out the postcode if it isn't set.
master
duncan.parkes 16 years ago
parent
commit
8eaa83b4cf
4 changed files with 81 additions and 6 deletions
  1. +71
    -0
      python_scrapers/Mendip.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +8
    -6
      python_scrapers/PlanningUtils.py
  4. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 71
- 0
python_scrapers/Mendip.py View File

@@ -0,0 +1,71 @@
import urllib2
import urllib
import urlparse

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime pattern for the search date as it appears in the query string:
# "%%" is strftime's escape for a literal "%", so "%%2F" emits "%2F" (a
# URL-encoded "/"), giving e.g. "01%2F10%2F2008" for 1 October 2008.
date_format = "%d%%2F%m%%2F%Y"

class MendipParser:
    """Scraper for planning applications listed by Mendip District Council.

    Applications are read from the council's PODS search result pages for a
    single received date and collected in a PlanningAuthorityResults object.
    """

    def __init__(self, *args):
        """Set up the authority names, URL templates and the results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Mendip District Council"
        self.authority_short_name = "Mendip"

        # The site itself uses a search by validated date, but received date seems
        # to be there too, and to work...
        # self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=&DateRecvTo=&DateValidFrom=%(date)s&DateValidTo=%(date)s&Search=Search"
        self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=%(date)s&DateRecvTo=%(date)s&Search=Search"
        self.comment_url = "http://www.mendip.gov.uk/ShowForm.asp?fm_fid=107&AppNo=%(reference)s&SiteAddress=%(address)s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape every application received on the given date.

        day, month and year are integers; datetime.date raises ValueError
        if they do not form a valid date.

        Each row of the "Application Results" table becomes a
        PlanningApplication that is added to self._results. Result pages
        are followed through the "Go to the next page" link until a page
        has no such link, or until a page reports that no applications
        matched the search.

        Returns self._results (the same object on every call, so repeated
        calls keep adding to it).
        """
        search_date = datetime.date(year, month, day)

        search_url = self.base_url %{"date": search_date.strftime(date_format)}

        while search_url:
            response = urllib2.urlopen(search_url)
            try:
                soup = BeautifulSoup(response.read())
            finally:
                # Release the connection as soon as the page has been read
                # rather than leaving it open until garbage collection.
                response.close()

            if soup.find(text="No applications matched the search criteria"):
                break

            for tr in soup.find("table", summary="Application Results").tbody.findAll("tr"):
                application = PlanningApplication()
                # The search is for a single received date, so every row
                # on the page shares it.
                application.date_received = search_date

                # Cells are read as: reference (with link), description, address.
                tds = tr.findAll("td")

                application.council_reference = tds[0].a.string.strip()
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
                application.description = tds[1].p.string.strip()
                application.address = tds[2].p.string.strip()

                application.comment_url = self.comment_url %{
                    "reference": application.council_reference,
                    "address": urllib.quote_plus(application.address),
                }

                self._results.addApplication(application)

            # A missing "next page" link ends the loop.
            next_link = soup.find("a", title="Go to the next page")
            search_url = urlparse.urljoin(self.base_url, next_link['href']) if next_link else None

        return self._results

    def getResults(self, day, month, year):
        """Scrape the given date and return the results' displayXML() output.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape the applications received on 1 October 2008
    # and print the resulting XML.
    mendip_parser = MendipParser()
    results_xml = mendip_parser.getResults(1, 10, 2008)
    print(results_xml)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv View File

@@ -66,3 +66,4 @@
"Cairngorms.py", "420" "Cairngorms.py", "420"
"Calderdale.py", "420" "Calderdale.py", "420"
"Broxtowe.py", "420" "Broxtowe.py", "420"
"Mendip.py", "420"

+ 8
- 6
python_scrapers/PlanningUtils.py View File

@@ -19,15 +19,14 @@ def fixNewlines(text):


postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]") postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")


def getPostcodeFromText(text):
def getPostcodeFromText(text, default_postcode="No Postcode"):
"""This function takes a piece of text and returns the first """This function takes a piece of text and returns the first
bit of it that looks like a postcode.""" bit of it that looks like a postcode."""


postcode_match = postcode_regex.search(text) postcode_match = postcode_regex.search(text)


if postcode_match is not None:
return postcode_match.group()
return postcode_match.group() if postcode_match else default_postcode



class PlanningAuthorityResults: class PlanningAuthorityResults:
"""This class represents a set of results of a planning search. """This class represents a set of results of a planning search.
@@ -69,10 +68,10 @@ class PlanningAuthorityResults:




class PlanningApplication: class PlanningApplication:
def __init__(self, no_postcode_default='No postcode'):
def __init__(self):
self.council_reference = None self.council_reference = None
self.address = None self.address = None
self.postcode = no_postcode_default
self.postcode = None
self.description = None self.description = None
self.info_url = None self.info_url = None
self.comment_url = None self.comment_url = None
@@ -104,6 +103,9 @@ class PlanningApplication:
def displayXML(self): def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received


if not self.postcode:
self.postcode = getPostcodeFromText(self.address)

contents = [ contents = [
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference), u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address), u"<address><![CDATA[%s]]></address>" %(self.address),


+ 1
- 0
python_scrapers/SitesToGenerate.csv View File

@@ -272,3 +272,4 @@
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser" "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser" "Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser" "Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"

Loading…
Cancel
Save