Sfoglia il codice sorgente

Allow &amp; in place of & for one of the urls (to fix Oldham, which has suddenly started to need this...)

import/raw
duncan.parkes 17 anni fa
parent
commit
562d4a64c5
2 file modificati con 16 aggiunte e 10 eliminazioni
  1. +15
    -9
      trunk/python_scrapers/AcolnetParser.py
  2. +1
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 15
- 9
trunk/python_scrapers/AcolnetParser.py Vedi File

@@ -41,7 +41,9 @@ class AcolnetParser(HTMLParser.HTMLParser):
# appropriate email address instead
comments_email_address = None

action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
# The optional amp; is to cope with Oldham, which seems to have started
# quoting this url.
action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
def __init__(self,
authority_name,
@@ -175,9 +177,6 @@ class AcolnetParser(HTMLParser.HTMLParser):
search_form_contents = search_form_response.read()

#outfile = open("tmpfile", "w")
#outfile.write(search_form_contents)

# This sometimes causes a problem in HTMLParser, so let's just get the link
# out with a regex...

@@ -186,8 +185,12 @@ class AcolnetParser(HTMLParser.HTMLParser):
action = groups[0]
#print action

# This is to handle the amp; which seems to have appeared in this
# url on the Oldham site
action = ''.join(action.split('amp;'))

action_url = urlparse.urljoin(self.base_url, action)
#print action_url
print action_url

our_date = date(year, month, day)
@@ -199,6 +202,9 @@ class AcolnetParser(HTMLParser.HTMLParser):
response = opener.open(action_url, search_data)
results_html = response.read()

#outfile = open("tmpfile", "w")
#outfile.write(results_html)

# This is for doing site specific html cleanup
results_html = self._cleanupHTML(results_html)

@@ -508,9 +514,9 @@ class SurreyHeathParser(AcolnetParser):
if __name__ == '__main__':
day = 22
month = 2
year = 2005
day = 20
month = 11
year = 2007

# returns error 400 - bad request
#parser = BridgenorthParser()
@@ -523,6 +529,6 @@ if __name__ == '__main__':

#parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch")
parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
print parser.getResults(day, month, year)

+ 1
- 1
trunk/python_scrapers/SitesToGenerate.csv Vedi File

@@ -104,7 +104,7 @@
"Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "CarlisleParser"
"Newcastle City Council", "Newcastle", "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"North Wiltshire District Council", "North Wiltshire", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NorthWiltshireParser"
"Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser"
"Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser"
"Renfrewshire Council", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "RenfrewshireParser"
"Selby District Council", "Selby", "http://publicaccess.selby.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"South Bedfordshire District Council", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "SouthBedfordshireParser"


Caricamento…
Annulla
Salva