Procházet zdrojové kódy

Add more debug.

master
duncan.parkes před 16 lety
rodič
revize
240f1b2d84
Změněny 2 soubory: 19 přidání a 4 odebrání
  1. +3
    -4
      python_scrapers/CGITemplate
  2. +16
    -0
      python_scrapers/Westminster.py

+ 3
- 4
python_scrapers/CGITemplate Zobrazit soubor

@@ -19,12 +19,11 @@ base_url = "%(base_url)s"
#print "Content-Type: text/html" # HTML is following #print "Content-Type: text/html" # HTML is following
#print #print


import %(module)s

parser = %(module)s.%(parser)s(authority_name, authority_short_name, base_url)

print "Content-Type: text/xml; charset=utf-8" # XML is following print "Content-Type: text/xml; charset=utf-8" # XML is following
print print


import %(module)s
parser = %(module)s.%(parser)s(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year) xml = parser.getResults(day, month, year)
print xml.encode("utf-8") # print the xml print xml.encode("utf-8") # print the xml

+ 16
- 0
python_scrapers/Westminster.py Zobrazit soubor

@@ -58,6 +58,7 @@ class WestminsterParser:
post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)} post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}


while post_data: while post_data:


# Now get the search page # Now get the search page


@@ -69,19 +70,30 @@ class WestminsterParser:
sys.stderr.write("Got it\n") sys.stderr.write("Got it\n")
soup = BeautifulSoup(response.read()) soup = BeautifulSoup(response.read())


sys.stderr.write("Created soup\n")

results_form = soup.find("form", {"name": "currentsearchresultsNext"}) results_form = soup.find("form", {"name": "currentsearchresultsNext"})


# Sort out the post_data for the next page, if there is one # Sort out the post_data for the next page, if there is one
# If there is no next page then there will be no inputs in the form. # If there is no next page then there will be no inputs in the form.
# In this case, post_data will be '', which is false. # In this case, post_data will be '', which is false.


sys.stderr.write("Found form containing results\n")

post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")]) post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])


sys.stderr.write("Got post data\n")

# Each result has one link, and they are the only links in the form # Each result has one link, and they are the only links in the form


links = results_form.findAll("a") links = results_form.findAll("a")


sys.stderr.write("Got list of links\n")

for link in links: for link in links:

sys.stderr.write("Working on link: %s\n" %link['href'])

application = PlanningApplication() application = PlanningApplication()


application.date_received = search_day application.date_received = search_day
@@ -111,6 +123,10 @@ class WestminsterParser:


self._results.addApplication(application) self._results.addApplication(application)


sys.stderr.write("Finished that link\n")


sys.stderr.write("Finished while loop, returning stuff.\n")


return self._results return self._results




Načítá se…
Zrušit
Uložit