Pārlūkot izejas kodu

Add scraper for Mendip.

Make display method on a planningapplication work out the postcode if it isn't set.
import/raw
duncan.parkes pirms 16 gadiem
vecāks
revīzija
887abe9652
4 mainītis faili ar 81 papildinājumiem un 6 dzēšanām
  1. +71
    -0
      trunk/python_scrapers/Mendip.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +8
    -6
      trunk/python_scrapers/PlanningUtils.py
  4. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 71
- 0
trunk/python_scrapers/Mendip.py Parādīt failu

@@ -0,0 +1,71 @@
import urllib2
import urllib
import urlparse

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime pattern used to render the search date for the query string:
# day, month and four-digit year separated by "%2F" (a URL-encoded "/").
# The doubled "%%" makes strftime emit a literal "%".
date_format = "%d%%2F%m%%2F%Y"

class MendipParser:
    """Scraper for the Mendip District Council planning application search.

    Applications found are added to a single PlanningAuthorityResults
    object that is created in __init__ and never reset, so repeated
    searches on the same instance accumulate into the same results.
    """

    def __init__(self, *args):
        """Set up authority names, URL templates and the results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Mendip District Council"
        self.authority_short_name = "Mendip"

        # The site itself uses a search by validated date, but received date seems
        # to be there too, and to work...
        # self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=&DateRecvTo=&DateValidFrom=%(date)s&DateValidTo=%(date)s&Search=Search"
        self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=%(date)s&DateRecvTo=%(date)s&Search=Search"
        self.comment_url = "http://www.mendip.gov.uk/ShowForm.asp?fm_fid=107&AppNo=%(reference)s&SiteAddress=%(address)s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetchPage(self, url):
        """Download url and return its body parsed with BeautifulSoup."""
        response = urllib2.urlopen(url)
        try:
            return BeautifulSoup(response.read())
        finally:
            # Close explicitly so a multi-page search does not leave one
            # open connection per page behind until garbage collection.
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect the applications received on the given date.

        day, month and year are integers. Every page of search results is
        visited by following the "Go to the next page" link until there is
        none, or until a page reports that nothing matched.

        Returns the PlanningAuthorityResults object held by this parser.
        """
        search_date = datetime.date(year, month, day)

        search_url = self.base_url % {"date": search_date.strftime(date_format)}

        while search_url:
            soup = self._fetchPage(search_url)

            if soup.find(text="No applications matched the search criteria"):
                break

            results_table = soup.find("table", summary="Application Results")

            for tr in results_table.tbody.findAll("tr"):
                application = PlanningApplication()
                application.date_received = search_date

                # Cells are, in order: reference (as a link), description, address.
                tds = tr.findAll("td")
                reference_link = tds[0].a

                application.council_reference = reference_link.string.strip()
                application.info_url = urlparse.urljoin(self.base_url, reference_link['href'])
                application.description = tds[1].p.string.strip()
                application.address = tds[2].p.string.strip()

                application.comment_url = self.comment_url % {
                    "reference": application.council_reference,
                    "address": urllib.quote_plus(application.address),
                }

                self._results.addApplication(application)

            # A missing "next page" link ends the loop.
            next_link = soup.find("a", title="Go to the next page")
            search_url = urlparse.urljoin(self.base_url, next_link['href']) if next_link else None

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: print the results for 1 October 2008.
    scraper = MendipParser()
    print(scraper.getResults(1, 10, 2008))


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Parādīt failu

@@ -66,3 +66,4 @@
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"
"Mendip.py", "420"

+ 8
- 6
trunk/python_scrapers/PlanningUtils.py Parādīt failu

@@ -19,15 +19,14 @@ def fixNewlines(text):

postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text):
def getPostcodeFromText(text, default_postcode="No Postcode"):
"""This function takes a piece of text and returns the first
bit of it that looks like a postcode."""

postcode_match = postcode_regex.search(text)

if postcode_match is not None:
return postcode_match.group()
return postcode_match.group() if postcode_match else default_postcode


class PlanningAuthorityResults:
"""This class represents a set of results of a planning search.
@@ -69,10 +68,10 @@ class PlanningAuthorityResults:


class PlanningApplication:
def __init__(self, no_postcode_default='No postcode'):
def __init__(self):
self.council_reference = None
self.address = None
self.postcode = no_postcode_default
self.postcode = None
self.description = None
self.info_url = None
self.comment_url = None
@@ -104,6 +103,9 @@ class PlanningApplication:
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

if not self.postcode:
self.postcode = getPostcodeFromText(self.address)

contents = [
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),


+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Parādīt failu

@@ -272,3 +272,4 @@
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"

Notiek ielāde…
Atcelt
Saglabāt