
Add scraper for Broxtowe.

master
duncan.parkes 16 years ago
parent
commit
42d99b3e7f
3 changed files with 115 additions and 0 deletions
  1. +113 -0  python_scrapers/Broxtowe.py
  2. +1 -0    python_scrapers/OtherFilesToCopy.csv
  3. +1 -0    python_scrapers/SitesToGenerate.csv

+113 -0  python_scrapers/Broxtowe.py

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class BroxtoweParser:
    def __init__(self, *args):

        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"

        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        # These are the inputs with a default value
        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the Friday after the date searched for.
        # At weekends this will get you the Friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())

        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offered.
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)
        post_url = get_response.url

        # In case something goes wrong here, let's break out of the loop after at most 10 passes
        passes = 0

        while True:
            passes += 1

            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            post_soup = BeautifulSoup(post_response.read())

            result_tables = post_soup.table.findAll("table")

            for result_table in result_tables:
                application = PlanningApplication()

                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                trs = result_table.findAll("tr")

                application.council_reference = trs[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = trs[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url % (urllib.quote(application.council_reference))

                # In order to avoid having to do a download for every app,
                # I'm setting the comment url to be the same as the info_url.
                # There is a comment page which can be got to by pressing the button.
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

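            # Getting the next page means reproducing the ASP.NET postback that
            # the "next" link fires: __EVENTTARGET names the control (hlbNext),
            # and the __VIEWSTATE and __EVENTVALIDATION tokens scraped from the
            # page we just fetched have to be posted back along with it.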
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = BroxtoweParser()
    print parser.getResults(3,10,2008)

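A note on the weekly-list date above: the expression search_day + datetime.timedelta(4 - search_day.weekday()) gives the Friday of the week containing the requested date (Monday through Friday), and the preceding Friday for a weekend date, as the comment in the scraper says. A minimal standalone check of that expression, using only the standard datetime module (not part of the scraper itself):

import datetime

def friday_for(search_day):
    # Same expression as in BroxtoweParser.getResultsByDayMonthYear:
    # Monday is 0 and Friday is 4, so weekdays land on the Friday of the
    # same week, and Saturday/Sunday land on the Friday just gone.
    return search_day + datetime.timedelta(4 - search_day.weekday())

# Wednesday 1 Oct 2008 falls in the weekly list dated Friday 3 Oct 2008,
# the same date the module's __main__ block queries.
assert friday_for(datetime.date(2008, 10, 1)) == datetime.date(2008, 10, 3)
# A Saturday maps back to the preceding Friday (no applications at weekends).
assert friday_for(datetime.date(2008, 10, 4)) == datetime.date(2008, 10, 3)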

+1 -0  python_scrapers/OtherFilesToCopy.csv

@@ -65,3 +65,4 @@
"Leicestershire.py", "420"
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"

+1 -0  python_scrapers/SitesToGenerate.csv

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
