소스 검색

Add scraper for Mendip.

Make display method on a planningapplication work out the postcode if it isn't set.
import/raw
duncan.parkes 16 년 전
부모
커밋
887abe9652
4개의 변경된 파일81개의 추가작업 그리고 6개의 파일을 삭제
  1. +71
    -0
      trunk/python_scrapers/Mendip.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +8
    -6
      trunk/python_scrapers/PlanningUtils.py
  4. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 71
- 0
trunk/python_scrapers/Mendip.py 파일 보기

@@ -0,0 +1,71 @@
import urllib2
import urllib
import urlparse

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"

class MendipParser:
def __init__(self, *args):
self.authority_name = "Mendip District Council"
self.authority_short_name = "Mendip"

# The site itelf uses a search by validated date, but received date seems
# to be there too, and to work...
# self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=&DateRecvTo=&DateValidFrom=%(date)s&DateValidTo=%(date)s&Search=Search"
self.base_url = "http://www.mendip.gov.uk/PODS/ApplicationSearchResults.asp?DateRecvFrom=%(date)s&DateRecvTo=%(date)s&Search=Search"
self.comment_url = "http://www.mendip.gov.uk/ShowForm.asp?fm_fid=107&AppNo=%(reference)s&SiteAddress=%(address)s"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

search_url = self.base_url %{"date": search_date.strftime(date_format)}

while search_url:
response = urllib2.urlopen(search_url)
soup = BeautifulSoup(response.read())

if soup.find(text="No applications matched the search criteria"):
break

for tr in soup.find("table", summary="Application Results").tbody.findAll("tr"):
application = PlanningApplication()
application.date_received = search_date

tds = tr.findAll("td")

application.council_reference = tds[0].a.string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
application.description = tds[1].p.string.strip()
application.address = tds[2].p.string.strip()

application.comment_url = self.comment_url %{
"reference": application.council_reference,
"address": urllib.quote_plus(application.address),
}

self._results.addApplication(application)

next_link = soup.find("a", title="Go to the next page")
search_url = urlparse.urljoin(self.base_url, next_link['href']) if next_link else None

return self._results


def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = MendipParser()
print parser.getResults(1,10,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv 파일 보기

@@ -66,3 +66,4 @@
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"
"Mendip.py", "420"

+ 8
- 6
trunk/python_scrapers/PlanningUtils.py 파일 보기

@@ -19,15 +19,14 @@ def fixNewlines(text):

postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text):
def getPostcodeFromText(text, default_postcode="No Postcode"):
"""This function takes a piece of text and returns the first
bit of it that looks like a postcode."""

postcode_match = postcode_regex.search(text)

if postcode_match is not None:
return postcode_match.group()
return postcode_match.group() if postcode_match else default_postcode


class PlanningAuthorityResults:
"""This class represents a set of results of a planning search.
@@ -69,10 +68,10 @@ class PlanningAuthorityResults:


class PlanningApplication:
def __init__(self, no_postcode_default='No postcode'):
def __init__(self):
self.council_reference = None
self.address = None
self.postcode = no_postcode_default
self.postcode = None
self.description = None
self.info_url = None
self.comment_url = None
@@ -104,6 +103,9 @@ class PlanningApplication:
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

if not self.postcode:
self.postcode = getPostcodeFromText(self.address)

contents = [
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),


+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv 파일 보기

@@ -272,3 +272,4 @@
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"

불러오는 중...
취소
저장