diff --git a/python_scrapers/CGITemplate b/python_scrapers/CGITemplate
new file mode 100644
index 0000000..e72f31e
--- /dev/null
+++ b/python_scrapers/CGITemplate
@@ -0,0 +1,29 @@
+# This is the parser for %(authority_name)s.
+# It is generated from the file CGITemplate.
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "%(authority_name)s"
+authority_short_name = "%(authority_short_name)s"
+base_url = "%(base_url)s"
+
+import PublicAccess
+
+parser = PublicAccess.PublicAccessParser(authority_name,
+                                         authority_short_name,
+                                         base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"    # XML is following
+print
+print xml                         # print the XML
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
new file mode 100644
index 0000000..3430576
--- /dev/null
+++ b/python_scrapers/PlanningUtils.py
@@ -0,0 +1,101 @@
+__auth__ = None
+
+import re
+
+date_format = "%d/%m/%Y"
+
+
+def xmlQuote(text):
+    # Change &s to &amp;s.
+    # I suspect there is probably some standard python
+    # function I should be using for this...
+    return text.replace('&', '&amp;')
+
+def fixNewlines(text):
+    # This can be used to sort out windows newlines
+    return text.replace("\r\n","\n")
+
+# So what can a postcode look like then?
+# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
+# AN NAA      M1 1AA
+# ANN NAA     M60 1NW
+# AAN NAA     CR2 6XH
+# AANN NAA    DN55 1PT
+# ANA NAA     W1A 1HP
+# AANA NAA    EC1A 1BB
+
+postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")
+
+def getPostcodeFromText(text):
+    """This function takes a piece of text and returns the first
+    bit of it that looks like a postcode."""
+
+    postcode_match = postcode_regex.search(text)
+
+    if postcode_match is not None:
+        return postcode_match.group()
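+
+# An assumed illustration of the function above (not part of the scraper):
+# the first postcode-shaped substring is returned, and the return value is
+# implicitly None when nothing matches.
+#
+#   getPostcodeFromText("1 High Street, Oxford OX1 2AB")  ->  'OX1 2AB'
+#   getPostcodeFromText("no postcode in here")            ->  None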
+
+
+class PlanningAuthorityResults:
+    """This class represents a set of results of a planning search.
+
+    This should probably be separated out so that it can be used for
+    authorities other than Cherwell.
+    """
+
+    def __init__(self, authority_name, authority_short_name):
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+
+        # this will be a list of PlanningApplication objects
+        self.planning_applications = []
+
+
+    def addApplication(self, application):
+        self.planning_applications.append(application)
+
+    def __repr__(self):
+        return self.displayXML()
+
+    def displayXML(self):
+        """This should display the contents of this object in the planningalerts format,
+        i.e. in the same format as this one:
+        http://www.planningalerts.com/lambeth.xml
+        """
+
+        applications_bit = "".join([x.displayXML() for x in self.planning_applications])
+
+        return "<planning>\n" +\
+               "<authority_name>%s</authority_name>\n" %self.authority_name +\
+               "<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
+               "<applications>\n" + applications_bit +\
+               "</applications>\n" +\
+               "</planning>\n"
+
+
+
+class PlanningApplication:
+    def __init__(self, no_postcode_default='No postcode'):
+        self.council_reference = None
+        self.address = None
+        self.postcode = no_postcode_default
+        self.description = None
+        self.info_url = None
+        self.comment_url = None
+
+        # expecting this as a datetime.date object
+        self.date_received = None
+
+    def __repr__(self):
+        return self.displayXML()
+
+    def displayXML(self):
+        return "<application>\n" +\
+               "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
+               "<address>%s</address>\n" %xmlQuote(self.address) +\
+               "<postcode>%s</postcode>\n" %self.postcode +\
+               "<description>%s</description>\n" %xmlQuote(self.description) +\
+               "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
+               "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
+               "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
+               "</application>\n"
\n" diff --git a/python_scrapers/PublicAccess.py b/python_scrapers/PublicAccess.py new file mode 100644 index 0000000..db635be --- /dev/null +++ b/python_scrapers/PublicAccess.py @@ -0,0 +1,341 @@ +#!/usr/bin/python + +import urllib, urllib2 +import HTMLParser +import urlparse +import datetime, time + +import cookielib + +cookie_jar = cookielib.CookieJar() + + +from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication + + +search_form_url_end = "tdc/DcApplication/application_searchform.aspx" +search_results_url_end = "tdc/DcApplication/application_searchresults.aspx" +comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx" + +class PublicAccessParser(HTMLParser.HTMLParser): + """This is the class which parses the PublicAccess search results page. + """ + + def __init__(self, + authority_name, + authority_short_name, + base_url, + debug=False): + + HTMLParser.HTMLParser.__init__(self) + + self.authority_name = authority_name + self.authority_short_name = authority_short_name + self.base_url = base_url + + self.debug = debug + + # this will change to True when we enter the table of results + self._in_results_table = False + + # this will be set to True when we have passed the header row + # in the results table + self._past_header_row = False + + # this will be true when we are in a in the results table + self._in_td = False + + # For each row, this will say how many tds we have seen so far + self._td_count = 0 + + # The object which stores our set of planning application results + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + # This will store the planning application we are currently working on. + self._current_application = None + + def handle_starttag(self, tag, attrs): + if tag == "table": + self.handle_start_table(attrs) + # we are only interested in tr tags if we are in the results table + elif self._in_results_table and tag == "tr": + self.handle_start_tr(attrs) + # we are only interested in td tags if we are in the results table + elif self._in_results_table and tag == "td": + self.handle_start_td(attrs) + # we are only interested in tags if we are in the 6th td in + # the results table. + elif self._in_td and self._td_count == 6 and tag == "a": + self.handle_start_a(attrs) + # If the tag is not one of these then we aren't interested + + def handle_endtag(self, tag): + # we only need to consider end tags if we are in the results table + if self._in_results_table: + if tag == "table": + self.handle_end_table() + if tag == "tr": + self.handle_end_tr() + if tag == "td": + self.handle_end_td() + + def handle_start_table(self, attrs): + for attr,value in attrs: + if attr == "class": + if value == "cResultsForm": + self._in_results_table = True + break + + def handle_end_table(self): + # If we see an end table tag, then note that we have left the + # results table. This method is only called if we are in that table. + self._in_results_table = False + + + def handle_start_tr(self, attrs): + # The first tr we meet in the results table is just headers + # We will set a flag at the end of that tr to avoid creating + # a blank PlanningApplication + if self._past_header_row: + # Create a candidate result object + self._current_application = PlanningApplication() + self._td_count = 0 + + def handle_end_tr(self): + # If we are in the results table, and not finishing the header row + # append the current result to the results list. 
+
+    def handle_start_tr(self, attrs):
+        # The first tr we meet in the results table is just headers.
+        # We will set a flag at the end of that tr to avoid creating
+        # a blank PlanningApplication
+        if self._past_header_row:
+            # Create a candidate result object
+            self._current_application = PlanningApplication()
+            self._td_count = 0
+
+    def handle_end_tr(self):
+        # If we are in the results table, and not finishing the header row,
+        # append the current result to the results list.
+        if self._past_header_row:
+            self._results.addApplication(self._current_application)
+        else:
+            # The first row of the results table is headers.
+            # We want to do nothing until after it
+            self._past_header_row = True
+
+    def handle_start_td(self, attrs):
+        # increase the td count by one
+        self._td_count += 1
+
+        # note that we are now in a td
+        self._in_td = True
+
+    def handle_end_td(self):
+        # note that we are now not in a td
+        self._in_td = False
+
+    def handle_start_a(self, attrs):
+        # this method only gets called if we are in the
+        # 6th td of a non-header row of the results table.
+
+        # go through the attributes of the <a> looking for one
+        # named 'href'
+        for attr,value in attrs:
+            if attr == "href":
+                # the value of this tag is a relative url.
+                # parse it so we can get the query string from it
+                parsed_info_url = urlparse.urlparse(value)
+
+                # the 4th part of the tuple is the query string
+                query_string = parsed_info_url[4]
+
+                # join this query string to the search URL, and store this as
+                # the info URL of the current planning application
+                self._current_application.info_url = urlparse.urljoin(self.base_url, value)
+
+                # Join this query string to the comments URL, and store this as
+                # the comments URL of the current planning application
+                comments_url = urlparse.urljoin(self.base_url, comments_url_end)
+                self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
+
+                # while we're here, let's follow some links to find the postcode...
+                # the postcode is in an input tag in the property page. This page
+                # can be found by following the info url.
+                # The newlines in the info page need fixing.
+                info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read())
+
+                info_file_parser = PublicAccessInfoPageParser()
+                info_file_parser.feed(info_file_contents)
+
+                property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url)
+
+                # the newlines in this page need fixing
+                property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())
+
+                property_file_parser = PublicAccessPropertyPageParser()
+                property_file_parser.feed(property_file_contents)
+
+                # Set the postcode on the current planning application from the
+                # one found on the property page
+                if property_file_parser.postcode is not None:
+                    self._current_application.postcode = property_file_parser.postcode
+
+                # There is no need for us to look at any more attributes.
+                break
+
+
+    def handle_data(self, data):
+        if self._in_td:
+            # The first td contains the reference
+            if self._td_count == 1:
+                self._current_application.council_reference = data
+
+            # The second td contains the date the application was received
+            elif self._td_count == 2:
+                year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
+                received_date = datetime.date(year, month, day)
+
+                self._current_application.date_received = received_date
+
+            # The third td contains the address
+            elif self._td_count == 3:
+                #data = data.replace("^M","\n")
+                self._current_application.address = data
+
+            # The fourth td contains the description
+            elif self._td_count == 4:
+                self._current_application.description = data
+            # 5 is status - we don't need it.
+            # 6 is a button - this is where we will get our postcode,
+            # comment_url, and info_url from (when handling the <a> tag).
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        # First download the search form (in order to get a session cookie)
+        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
+        search_form_response = urllib2.urlopen(search_form_request)
+
+        cookie_jar.extract_cookies(search_form_response, search_form_request)
+
+
+        # We are only doing this first search in order to get a cookie.
+        # The paging on the site doesn't work with cookies turned off.
+
+        search_data1 = urllib.urlencode({"searchType":"ADV",
+                                         "caseNo":"",
+                                         "PPReference":"",
+                                         "AltReference":"",
+                                         "srchtype":"",
+                                         "srchstatus":"",
+                                         "srchdecision":"",
+                                         "srchapstatus":"",
+                                         "srchappealdecision":"",
+                                         "srchwardcode":"",
+                                         "srchparishcode":"",
+                                         "srchagentdetails":"",
+                                         "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year},
+                                         "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} })
+
+        if self.debug:
+            print search_data1
+
+
+        search_url = urlparse.urljoin(self.base_url, search_results_url_end)
+        request1 = urllib2.Request(search_url, search_data1)
+        cookie_jar.add_cookie_header(request1)
+        response1 = urllib2.urlopen(request1)
+
+        # This search is the one we will actually use.
+        # A maximum of 100 results are returned on this site,
+        # hence setting "pagesize" to 100. I doubt there will ever
+        # be more than 100 in one day in PublicAccess...
+        # "currentpage" = 1 gets us to the first page of results
+        # (there will only be one anyway, as we are asking for 100 results...)
+
+#http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3
+
+        search_data2 = urllib.urlencode((("szSearchDescription",
+                                          "Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year}),
+                                         ("searchType","ADV"),
+                                         ("bccaseno",""),
+                                         ("currentpage","1"),
+                                         ("pagesize","100"),
+                                         ("module","P3")))
+
+        if self.debug:
+            print search_data2
+
+        # This time we want to do a get request, so add the search data into the url
+        request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)
+
+        request2 = urllib2.Request(request2_url)
+
+        # add the cookie we stored from our first search
+        cookie_jar.add_cookie_header(request2)
+
+        response2 = urllib2.urlopen(request2)
+
+        contents = fixNewlines(response2.read())
+
+        if self.debug:
+            print contents
+
+        self.feed(contents)
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+
+
+class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
+    """A parser to get the URL for the property details page out of the
+    info page (this url is needed in order to get the postcode of the
+    application).
+    """
+
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        self.property_page_url = None
+
+    def handle_starttag(self, tag, attrs):
+        """The URL of the property details page is contained in an <a> tag with
+        id 'A_btnPropertyDetails'. There is some garbage on
+        either side of it which we will have to clear up before storing it...
+
+        We go through the <a> tags looking for one with an attribute with
+        key 'id' and value 'A_btnPropertyDetails'. When we find it we go through
+        its attributes looking for one with key 'href' - the value of this attribute
+        contains the URL we want, after a bit of tidying up.
+
+        Once we have got the URL, there is no need for us to look at any more <a> tags.
+        """
+        if tag == "a" and self.property_page_url is None:
+            if attrs.count(("id","A_btnPropertyDetails")) > 0:
+                for attr,value in attrs:
+                    if attr == "href":
+                        the_link = value
+
+                        # this has some garbage on either side of it...
+                        # let's strip that off
+
+                        # the garbage on the left is separated by whitespace.
+                        # the garbage on the right is separated by a "'".
+
+                        self.property_page_url = the_link.split()[1].split("'")[0]
+
+
+
+
+class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
+    """A parser to get the postcode out of the property details page."""
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        self.postcode = None
+
+    def handle_starttag(self, tag, attrs):
+        """The postcode is contained in an <input> tag.
+        This tag has an attribute 'name' with value 'postcode'.
+        It also has an attribute 'value' whose value is the postcode of this application.
+
+        We go through the input tags looking for one with an attribute with
+        key 'name' and value 'postcode'. When we find one,
+        we look through its attributes for one with key 'value' - we store the value of this
+        attribute as self.postcode.
+
+        Once we have the postcode, there is no need to look at any more input tags.
+        """
+
+        if tag == "input" and self.postcode is None:
+            if attrs.count(("name","postcode")) > 0:
+                for attr,value in attrs:
+                    if attr == "value":
+                        self.postcode = value
+
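As a quick smoke test, the parser can be driven straight from an interpreter; a minimal sketch, using the York row from the CSV below and an arbitrary date:

import PublicAccess

parser = PublicAccess.PublicAccessParser("City of York Council",
                                         "York",
                                         "http://planning.york.gov.uk/PublicAccess/")
print parser.getResults(22, 2, 2007)   # prints the planningalerts XML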
diff --git a/python_scrapers/PublicAccessSites.csv b/python_scrapers/PublicAccessSites.csv
new file mode 100644
index 0000000..a604ba3
--- /dev/null
+++ b/python_scrapers/PublicAccessSites.csv
@@ -0,0 +1,29 @@
+"authority_name", "authority_short_name", "base_url"
+"City of York Council", "York", "http://planning.york.gov.uk/PublicAccess/"
+"Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/"
+"Angus Council", "Angus", "http://planning.angus.gov.uk/PublicAccess/"
+"Huntingdonshire District Council", "Huntingdonshire", "http://planning.huntsdc.gov.uk/publicaccess/"
+"South Staffordshire Council", "South Staffs", "https://services.sstaffs.gov.uk/PublicAccess/"
+"Bexley Council", "Bexley", "http://publicaccess.bexley.gov.uk/publicaccess/"
+"Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/"
+"Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/publicaccess/"
+"Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/"
+"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/"
+"Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/"
+"Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/"
+"Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/"
+"Southampton City Council", "Southampton", "http://publicaccess.southampton.gov.uk/publicaccess/"
+"Bath and North East Somerset", "Bath", "http://planning.bathnes.gov.uk/publicaccess/"
+"Buckinghamshire County Council", "Buckinghamshire", "http://www.bucksplanning.gov.uk/PublicAccess/"
+"Spelthorne Borough Council", "Spelthorne", "http://phoenix.spelthorne.gov.uk/PublicAccess/"
+"Stevenage Borough Council", "Stevenage", "http://publicaccess.stevenage.gov.uk/publicaccess/"
+"Tonbridge and Malling Borough Council", "Tonbridge", "http://publicaccess.tmbc.gov.uk/publicaccess/"
+"Hart District Council", "Hart", "http://publicaccess.hart.gov.uk/publicaccess/"
+"Luton Borough Council", "Luton", "http://www.eplan.luton.gov.uk/PublicAccess/"
+"Rushmoor Borough Council", "Rushmoor", "http://pa-dc.rushmoor.gov.uk/publicaccess/"
+"Blaby District Council", "Blaby", "http://www.blaby.gov.uk/PublicAccess/"
+"East Devon District Council", "East Devon", "http://planning.eastdevon.gov.uk/PublicAccess/"
+"Mid Devon District Council", "Mid Devon", "http://planning.middevon.gov.uk/publicaccess/"
+"Sevenoaks District Council", "Sevenoaks", "http://publicaccess.sevenoaks.gov.uk/publicaccess/"
+"Woking Borough Council", "Woking", "http://caps.woking.gov.uk/publicaccess/"
+"Basildon District Council", "Basildon", "http://planning.basildon.gov.uk/publicaccess/"
\ No newline at end of file
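generateCGIScripts.py (at the end of this patch) substitutes each row of this CSV into CGITemplate, so the York row, for example, should produce a CGI/York.cgi beginning roughly like this (an assumed rendering):

#!/usr/bin/python

# This is the parser for City of York Council.
# It is generated from the file CGITemplate.
...
authority_name = "City of York Council"
authority_short_name = "York"
base_url = "http://planning.york.gov.uk/PublicAccess/"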
diff --git a/python_scrapers/SouthOxfordshire.cgi b/python_scrapers/SouthOxfordshire.cgi
new file mode 100755
index 0000000..6e0322a
--- /dev/null
+++ b/python_scrapers/SouthOxfordshire.cgi
@@ -0,0 +1,20 @@
+#!/usr/bin/python
+
+import cgi
+import cgitb; cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+from SouthOxfordshireParser import SouthOxfordshireParser
+
+parser = SouthOxfordshireParser()
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"    # XML is following
+print
+print xml                         # print the XML
diff --git a/python_scrapers/SouthOxfordshireParser.py b/python_scrapers/SouthOxfordshireParser.py
new file mode 100644
index 0000000..0097ee5
--- /dev/null
+++ b/python_scrapers/SouthOxfordshireParser.py
@@ -0,0 +1,248 @@
+
+import urllib, urllib2
+
+import HTMLParser
+import urlparse
+import datetime, time
+
+# This needs a page number inserting
+search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"
+
+# This needs the council reference
+comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"
+
+authority_name = "South Oxfordshire District Council"
+authority_short_name = "South Oxfordshire"
+
+
+from PlanningUtils import fixNewlines, \
+     getPostcodeFromText, \
+     PlanningAuthorityResults, \
+     PlanningApplication
+
+class SouthOxfordshireParser(HTMLParser.HTMLParser):
+    """In this case we'll take the date, so that we can avoid doing downloads for
+    the other days in this week's file. This date should be a datetime.date object.
+    """
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        self._requested_date = None
+
+        # We'll keep a count of the number of tables we have seen.
+        # All the interesting stuff is in table 3
+        self._table_count = 0
+
+        # While inside table 3, we'll keep a count of the number of
+        # <td>s we have seen. What is in which numbered <td> is detailed below.
+        # 1 reference
+        # 3 place and description
+        # 5 date received
+        # 2 and 4 are just padding
+        self._td_count = 0
+
+        # This is just a flag to say that we are now ready to get the reference
+        # from the next bit of data
+        self._get_reference = False
+
+        self._data = ''
+
+        # this will hold the application we are currently working on.
+        self._current_application = None
+
+        # The object which stores our set of planning application results
+        self._results = PlanningAuthorityResults(authority_name, authority_short_name)
+
+    def handle_starttag(self, tag, attrs):
+        # if we see a table tag, increment the table count.
+        if tag == 'table':
+            self._table_count += 1
+
+        # we are only interested in other tags if we are in table 3.
+        if self._table_count == 3:
+
+            # If we are starting a <tr>, create a new PlanningApplication object
+            # for the application currently being processed
+            if tag == 'tr':
+                self._current_application = PlanningApplication()
+
+            # if we see a td, increment the count.
+            if tag == 'td':
+                self._td_count += 1
+
+            # if we are in the first <td>, and we see a link,
+            # then it is to the info page for this application.
+            if tag == 'a' and self._td_count == 1:
+                for key, value in attrs:
+                    if key == 'href':
+                        url_end = value
+                        self._current_application.info_url = urlparse.urljoin(search_url,url_end)
+
+                        # We now know that the next bit of data is the reference
+                        self._get_reference = True
+
+                        # href is the only attribute we are interested in.
+                        break
+
+    def handle_endtag(self, tag):
+        # There is no need to do anything unless we are in table 3.
+        if self._table_count == 3:
+
+            # The end </tr> indicates that the current application is finished.
+            # Now we can fetch the info_page to get the address, postcode,
+            # and description.
+            # If we don't have a reference, then we are in the header row,
+            # which we don't want.
+            # There is no point in doing this if the date is not the requested one.
+
+            if tag == 'tr' and \
+                   self._current_application.council_reference is not None and \
+                   self._current_application.date_received == self._requested_date:
+
+                info_page_parser = SouthOxfordshireInfoURLParser()
+                info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())
+
+                self._current_application.address = info_page_parser.address
+                self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
+                self._current_application.description = info_page_parser.description
+
+                # Add the current application to the results set
+                self._results.addApplication(self._current_application)
+
+            # At the end of the 5th <td>, self._data should contain
+            # the received date of the application.
+            if tag == 'td' and self._td_count == 5:
+                app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
+                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
+                self._data = ''
+                self._td_count = 0
+
+    def handle_data(self, data):
+        # There is no need to do anything if we aren't in table 3.
+        if self._table_count == 3:
+            # If we are in the first <td>, and the get_reference flag is set,
+            # then the next data is the reference.
+            if self._td_count == 1 and self._get_reference:
+                self._current_application.council_reference = data
+
+                # The comment url can now be made, as it depends only on the reference.
+                # On this site, the link to the comment page is only displayed once
+                # the planning authority has decided who is handling this application
+                # and has opened consultations. The link below works straight away,
+                # and also works for apps for which the consultation period is over.
+                # I have no idea if anything is actually done with these comments if
+                # it is followed too early...
+                self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}
+
+                # Set the get_reference flag back to False.
+                self._get_reference = False
+
+            # If we are in the 5th <td>, then we need to collect all the data together
+            # before we can use it. This is actually processed in handle_endtag.
+            if self._td_count == 5:
+                self._data += data
+
+    def handle_entityref(self, ref):
+        # We might have some entity_refs to clear up.
+        # There is no need to bother with this if we aren't in the results table.
+        if self._table_count == 3 and self._td_count == 5:
+            if ref == 'nbsp':
+                self._data += ' '
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """This will return an ApplicationResults object containing the
+        applications for the date passed in."""
+
+        today = datetime.date.today()
+        self._requested_date = datetime.date(year, month, day)
+        delta = today - self._requested_date
+
+        # to get the correct page, we need
+        # page ((days / 7) + 1) - note the integer division
+        page_number = delta.days/7 + 1
+
+        response = urllib2.urlopen(search_url %page_number)
+
+        contents = response.read()
+
+        self.feed(contents)
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
+    """This parser is to get the description and address out of the info page
+    for a South Oxfordshire application."""
+
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        self.address = None
+        self.description = None
+
+        # These two states will be set to:
+        # 0 - if we haven't yet got that bit
+        # 1 - if we are currently working on it
+        # 2 - if we have finished
+        self._address_state = 0
+        self._description_state = 0
+
+        # We will need to know whether or not we are in a <td>
+        self._in_td = False
+
+        # This is used for collecting together data which comes in several bits.
+        self._data = ''
+
+    def handle_starttag(self, tag, attrs):
+        # If we see the start of a <td> and we are still interested in some data,
+        # then set the td flag to true, and blank the data
+        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
+            self._in_td = True
+            self._data = ''
+
+    def handle_endtag(self, tag):
+        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
+            # If we are working on the description,
+            # set description from _data and note that we need to work on it no more.
+            if self._description_state == 1:
+                self.description = self._data
+                self._description_state = 2
+
+
+            # If we are working on the address,
+            # set address from _data and note that we need to work on it no more.
+            elif self._address_state == 1:
+                self.address = self._data
+                self._address_state = 2
+
+            # If we see data which says 'Description',
+            # then set the description state to working.
+            elif self._data.strip() == 'Description':
+                self._description_state = 1
+
+            # If we see data which says 'Location',
+            # then set the address state to working.
+            elif self._data.strip() == 'Location':
+                self._address_state = 1
+
+            # Note that we are leaving the <td>
+            self._in_td = False
+
+    def handle_data(self, data):
+        # if we are in a td, and we are still interested in the data for something,
+        # append the current bit to self._data
+        if self._in_td and (self._address_state < 2 or self._description_state < 2):
+            self._data += data
+
+
+# TODO
+
+# Find out what time of day this is run - does it matter that
+# we aren't being careful with daylight saving time etc.?
+
+# Can we check that a scraped email address really is
+# an email address?
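A worked example of the paging arithmetic in getResultsByDayMonthYear above (dates invented): asking for 14/02/2007 on 22/02/2007 gives a delta of 8 days, and Python 2's integer division then selects page 2, i.e. last week's file:

>>> import datetime
>>> delta = datetime.date(2007, 2, 22) - datetime.date(2007, 2, 14)
>>> delta.days / 7 + 1
2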
diff --git a/python_scrapers/createCGI.sh b/python_scrapers/createCGI.sh
new file mode 100755
index 0000000..8e989bc
--- /dev/null
+++ b/python_scrapers/createCGI.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+echo Removing contents of CGI directory
+svn rm --force ../CGI/*
+
+echo Running generateCGIScripts
+python generateCGIScripts.py
+
+svn add ../CGI/*
diff --git a/python_scrapers/generateCGIScripts.py b/python_scrapers/generateCGIScripts.py
new file mode 100755
index 0000000..d66e715
--- /dev/null
+++ b/python_scrapers/generateCGIScripts.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+list_of_sites_filename = "PublicAccessSites.csv"
+template_filename = "CGITemplate"
+python_location = "/usr/bin/python"
+
+cgi_dir = "../CGI/"
+
+# this should be a config file
+other_files = ["PublicAccess.py", "PlanningUtils.py", "SouthOxfordshireParser.py", "SouthOxfordshire.cgi"]
+
+import csv
+from os import chmod
+from shutil import copyfile
+
+list_of_sites_file = open(list_of_sites_filename)
+csv_reader = csv.DictReader(list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True)
+
+# svn rm the cgi directory
+
+# create the cgi directory
+
+
+# create cgi files and write them in the cgi directory
+template_contents = open(template_filename).read()
+
+template = "#!" + python_location + "\n\n" + template_contents
+
+for site_dict in csv_reader:
+    filename = cgi_dir + "%s.cgi" %site_dict["authority_short_name"]
+    contents = template %site_dict
+
+    this_file = open(filename, "w")
+    print "Writing %s" %filename
+    this_file.write(contents)
+    this_file.close()
+
+    chmod(filename, 0755)
+
+# copy across other files that are needed
+# these should probably come from a config file
+for filename in other_files:
+    copyfile(filename, cgi_dir+filename)
+
+
+# write a README to warn people not to svn add stuff to CGI directory
+readme_message = """
+WARNING - this directory is only for generated files
+and files which are automatically copied in.
+Anything manually added here will be svn deleted.
+
+"""
+readme_file = open(cgi_dir + "README", "w")
+readme_file.write(readme_message)
+readme_file.close()
+
+# svn add the cgi directory and its contents
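Note that the svn add promised by the script's final comment is actually done by createCGI.sh, which is the intended entry point for regenerating everything. Run on its own, the generator should print one "Writing" line per CSV row, along these lines (abridged, assuming the CSV in this patch):

$ python generateCGIScripts.py
Writing ../CGI/York.cgi
Writing ../CGI/Cherwell.cgi
...
Writing ../CGI/Basildon.cgi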