#!/usr/bin/env python list_of_sites_filename = "SitesToGenerate.csv" other_files_to_copy_filename = "OtherFilesToCopy.csv" other_files_location = "python_scrapers/" template_filename = "python_scrapers/CGITemplate.py" cgi_dir = "cgi-bin/" import csv import urllib from os import chmod, environ from shutil import copyfile import MySQLdb # First, copy across files that are needed in the CGI directory # that aren't generated. other_files_to_copy = open(other_files_to_copy_filename) other_files_csv_reader = csv.DictReader( other_files_to_copy, quoting=csv.QUOTE_ALL, skipinitialspace=True, ) for file_dict in other_files_csv_reader: filename = file_dict["filename"] copyfile(other_files_location + filename, cgi_dir+filename) # the idea here is to have filename and permissions # in the csv file. # Until version 2.6 of python, there is no easy way # to convert a string to an octal, so I am using # integers to represent permissions... # see README for details. chmod(cgi_dir+filename, int(file_dict["permissions"])) # Next we generate the cgi files list_of_sites_file = open(list_of_sites_filename) csv_reader = csv.DictReader( list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True, ) # create cgi files and write them in the cgi directory template= open(template_filename).read() # Get a mysql cursor mysql_connection = MySQLdb.connect( db=environ['MYSQL_DB_NAME'], user=environ['MYSQL_USERNAME'], passwd=environ['MYSQL_PASSWORD'], ) mysql_cursor = mysql_connection.cursor() python_scraper_location = "/cgi-bin/%s.cgi?day={day}&month={month}&year={year}" php_scraper_location = "/scrapers/%(php_scraper)s.php?day={day}&month={month}&year={year}" # All of this should probably be done with SqlAlchemy or something. authority_select_query = "SELECT * FROM authority WHERE short_name = '%(short_name)s';" # FIXME: Both of these queries should set planning_email and notes. authority_insert_query = 'INSERT INTO authority (full_name, short_name, feed_url, external, disabled, planning_email) values ("%(full_name)s", "%(short_name)s", "%(feed_url)s", %(external)s, %(disabled)s, "%(planning_email)s");' authority_update_query = 'UPDATE authority SET full_name="%(full_name)s", external="%(external)s", disabled=%(disabled)s, feed_url="%(feed_url)s", external=%(external)s WHERE short_name = "%(short_name)s";' for site_dict in csv_reader: # We need these to be 1 or 0 to pass them into mysql. site_dict['external'] = 1 if site_dict['external'] else 0 site_dict['disabled'] = 1 if site_dict['disabled'] else 0 if site_dict['external']: # This scraper is somewhere else. pass elif site_dict['feed_url']: # This scraper is local and uses an non-generated file in cgi-bin pass elif site_dict['php_scraper']: # Uses a PHP scraper. site_dict['feed_url'] = php_scraper_location %site_dict elif site_dict['python_module'] and site_dict['parser_class']: # We need to generate a python CGI file file_location = cgi_dir + "%(short_name)s.cgi" %site_dict contents = template %site_dict this_file = open(file_location, "w") this_file.write(contents) this_file.close() chmod(file_location, 0755) quoted_short_name = urllib.quote(site_dict["short_name"]) site_dict['feed_url'] = python_scraper_location %(quoted_short_name) else: # Something has gone wrong. print "ERROR: Config for %(short_name)s is faulty." %site_dict # print "Disabling this scraper" # FIXME: Should have a query here to set disabled for this scraper. continue # Do we have a record for this authority already? row_count = mysql_cursor.execute(authority_select_query %site_dict) if row_count > 1: print "ERROR: There is more than one row for %(short_name)s." %site_dict print "Skipping this scraper." continue elif row_count == 1: mysql_cursor.execute(authority_update_query %site_dict) elif row_count == 0: mysql_cursor.execute(authority_insert_query %site_dict) else: print "ERROR: How on earth did we get here? Row count is %s" %(row_count) # write a README to warn people not to svn add stuff to CGI directory readme_message = """ WARNING - this directory is only for generated files and files which are automatically copied in. Anything manually added here will be lost. """ readme_file = open(cgi_dir + "README", "w") readme_file.write(readme_message) readme_file.close()