Procházet zdrojové kódy

Add scraper for Mendip.

Make display method on a planningapplication work out the postcode if it isn't set.
import/raw
duncan.parkes před 16 roky
rodič
revize
887abe9652
4 změnil soubory, kde provedl 81 přidání a 6 odebrání
  1. +71
    -0
      trunk/python_scrapers/Mendip.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +8
    -6
      trunk/python_scrapers/PlanningUtils.py
  4. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 71
- 0
trunk/python_scrapers/Mendip.py Zobrazit soubor

@@ -0,0 +1,71 @@
import urllib2
import urllib
import urlparse

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"

class MendipParser:
def __init__(self, *args):
self.authority_name = "Mendip District Council"
self.authority_short_name = "Mendip"

# The site itelf uses a search by validated date, but received date seems
# to be there too, and to work...
# self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=&DateRecvTo=&DateValidFrom=%(date)s&DateValidTo=%(date)s&Search=Search"
self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=%(date)s&DateRecvTo=%(date)s&Search=Search"
self.comment_url = "http://www.mendip.gov.uk/ShowForm.asp?fm_fid=107&AppNo=%(reference)s&SiteAddress=%(address)s"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

search_url = self.base_url %{"date": search_date.strftime(date_format)}

while search_url:
response = urllib2.urlopen(search_url)
soup = BeautifulSoup(response.read())

if soup.find(text="No applications matched the search criteria"):
break

for tr in soup.find("table", summary="Application Results").tbody.findAll("tr"):
application = PlanningApplication()
application.date_received = search_date

tds = tr.findAll("td")

application.council_reference = tds[0].a.string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
application.description = tds[1].p.string.strip()
application.address = tds[2].p.string.strip()

application.comment_url = self.comment_url %{
"reference": application.council_reference,
"address": urllib.quote_plus(application.address),
}

self._results.addApplication(application)

next_link = soup.find("a", title="Go to the next page")
search_url = urlparse.urljoin(self.base_url, next_link['href']) if next_link else None

return self._results


def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = MendipParser()
print parser.getResults(1,10,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -66,3 +66,4 @@
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"
"Mendip.py", "420"

+ 8
- 6
trunk/python_scrapers/PlanningUtils.py Zobrazit soubor

@@ -19,15 +19,14 @@ def fixNewlines(text):

postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text):
def getPostcodeFromText(text, default_postcode="No Postcode"):
"""This function takes a piece of text and returns the first
bit of it that looks like a postcode."""

postcode_match = postcode_regex.search(text)

if postcode_match is not None:
return postcode_match.group()
return postcode_match.group() if postcode_match else default_postcode


class PlanningAuthorityResults:
"""This class represents a set of results of a planning search.
@@ -69,10 +68,10 @@ class PlanningAuthorityResults:


class PlanningApplication:
def __init__(self, no_postcode_default='No postcode'):
def __init__(self):
self.council_reference = None
self.address = None
self.postcode = no_postcode_default
self.postcode = None
self.description = None
self.info_url = None
self.comment_url = None
@@ -104,6 +103,9 @@ class PlanningApplication:
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

if not self.postcode:
self.postcode = getPostcodeFromText(self.address)

contents = [
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),


+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -272,3 +272,4 @@
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"

Načítá se…
Zrušit
Uložit