Merton Council planning applications
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

parser.rb 4.8 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. require 'nokogiri'
  2. require 'breasal'
  3. require 'date'
  4. require 'uri'
  5. require 'pp'
  # Trims whitespace from a string, including trailing Unicode space
  # separators (eg the non-breaking space, char 160) that String#strip
  # leaves behind.
  #
  # The argument is never modified, so frozen strings are accepted.
  #
  # @param s [String, nil] the text to clean
  # @return [String, nil] the cleaned text, or nil if s is nil or nothing
  #   is left after trimming
  def clean_end(s)
    return nil if s.nil?

    # \z anchors at the very end of the string; `$` would also match before
    # an embedded newline and trim the wrong place in multi-line text.
    cleaned = s.strip.sub(/[\p{Zs}\s]+\z/, '')
    cleaned.empty? ? nil : cleaned
  end
  # Reduces each node's inner HTML to its bare text value.
  #
  # For every item the inner HTML is stripped, then HTML entities are
  # deleted, then any <span>...</span> section is deleted, and finally tabs,
  # carriage returns and newlines are removed.
  #
  # @param items [Enumerable] objects responding to #inner_html
  # @return [Array<String>] one cleaned string per item
  def cleanup(items)
    items.map do |item|
      item.inner_html.strip
          # Match one entity at a time (&nbsp; &#160; &#xA0;); a greedy
          # /&.+;/ would also swallow any text lying between two entities.
          .gsub(/&#?\w+;/, '')
          # Regex doesn't work across multiple text lines by default, hence /m
          .gsub(/<span>.*<\/span>/m, '')
          .delete("\t\r\n")
    end
  end
  # Parses an application details page into a hash of application fields.
  #
  # Fields are read by position from the first two "ul.list" lists and from
  # the "a.FooterLinks" anchors, so a change in page layout will shift them.
  # Relies on DATE_REGEX, SITE_URL and BASE_URL, which are defined outside
  # this method.
  #
  # @param html [String] HTML of the application details page
  # @return [Hash] string keys; date fields are only present when the page
  #   text matches DATE_REGEX
  def parse_details(html)
    doc = Nokogiri::HTML(html)
    app = {}
    lists = doc.search("ul.list")

    # Stores a parsed date under key only when the text looks like a date.
    set_date = lambda do |key, text|
      app[key] = Date.parse(text) if text && text.match(DATE_REGEX)
    end

    # First ul is Application Progress Summary
    values = cleanup(lists[0].search("li div"))
    set_date.call('date_received', values[0])
    app['status'] = clean_end(values[1])
    set_date.call('on_notice_to', values[2])
    app['recommendation'] = clean_end(values[3])
    set_date.call('date_committee', values[4])
    app['decision'] = clean_end(values[5])
    # FIXME Is this actually a date or a Yes/No?
    set_date.call('date_appeal_lodged', values[6])
    # NOTE: overwritten below by the value from the Application Details list.
    app['appeal_decision'] = clean_end(values[7])

    # Second ul is Application Details
    items = lists[1].search("li div")
    # The value of each row is read from the third child node of its div.
    field = lambda { |index| items[index].children[2].inner_text }

    app['council_reference'] = clean_end(field.call(0))
    app['application_type'] = clean_end(field.call(2))
    app['applicant_name'] = clean_end(field.call(5))
    app['agent_name'] = clean_end(field.call(6))
    app['wards'] = clean_end(field.call(7))

    # Regex doesn't work across multiple text lines by default, hence /m
    coords_html = items[8].inner_html.strip.gsub(/&.+;/m, '')
    en_string = coords_html.match(/Easting.+?(\d+).+?Northing.+?(\d+)/)
    if en_string
      app['easting'] = en_string[1].to_i
      app['northing'] = en_string[2].to_i
      en = Breasal::EastingNorthing.new(easting: app['easting'], northing: app['northing'], type: :gb)
      wgs84 = en.to_wgs84
      app['latitude'] = wgs84[:latitude]
      app['longitude'] = wgs84[:longitude]
    end

    app['appeal_submitted'] = clean_end(field.call(9))
    app['appeal_decision'] = clean_end(field.call(10))

    phone_text = field.call(11)
    if phone_text.match(/\d+/)
      # Keep the first run of digits and swap a leading 44 for 0.
      app['case_officer_phone'] = clean_end(phone_text.gsub(/[\r\n\t]/, '')).match(/(\d+)/)[1].sub(/^44/, '0')
    end

    app['division'] = clean_end(field.call(12).gsub('-', ''))
    app['case_officer_name'] = clean_end(field.call(13))
    app['determination_level'] = clean_end(field.call(14))
    app['existing_land_use'] = clean_end(field.call(15))
    app['proposed_land_use'] = clean_end(field.call(16))

    # Third ul is Other Information Available for Planning Application...
    links = doc.search("a.FooterLinks")
    # URI::encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape performs
    # the same escaping. The gsub then drops percent-encoded control
    # characters (%0A, %0D, %09, ...) left by line breaks in the href.
    page_url = lambda do |link|
      URI::DEFAULT_PARSER.escape(BASE_URL + link['href']).gsub(/%0./m, '')
    end

    app['documents_url'] = SITE_URL + links[0]['href'].gsub(/[\r\n\t]/, '')
    app['dates_url'] = page_url.call(links[1])
    app['checks_url'] = page_url.call(links[2])
    app['meetings_url'] = page_url.call(links[3])
    app['constraints_url'] = page_url.call(links[4])
    # The sixth link is not always present.
    app['site_history_url'] = page_url.call(links[5]) if links[5]

    app
  end
  # Parses the dates page of an application into a hash of Date values.
  #
  # Rows are matched to field names by position within ".dataview ul div".
  # A field is only set when its row exists and its text matches DATE_REGEX
  # (defined outside this method); missing rows are skipped instead of
  # raising NoMethodError.
  #
  # @param html [String] HTML of the application dates page
  # @return [Hash{String => Date}] the dates found, keyed by field name
  def parse_dates(html)
    doc = Nokogiri::HTML(html)

    # Field names in the order their rows appear on the page.
    field_names = %w[
      date_received
      date_first_advertised
      date_registered
      date_first_site_notice
      date_valid
      on_notice_to
      date_validated
      target_date
      stat_cons_expiry_date
      decision_expiry_date
      first_consultation_date
      extended_expiry_date
    ]

    # The value of each row is read from the third child node of its div.
    dates = doc.search(".dataview ul div").map do |row|
      cell = row.children[2]
      cell ? cell.inner_text : nil
    end

    field_names.each_with_index.each_with_object({}) do |(name, index), app|
      text = dates[index]
      app[name] = Date.parse(text) if text && text.match(DATE_REGEX)
    end
  end
  # Parses the documents page of an application into a list of documents.
  #
  # Every anchor inside "#tblContent td" that has an href becomes one entry.
  # Anchors without an href are skipped, since no URL can be built for them.
  # SITE_URL is defined outside this method.
  #
  # @param html [String] HTML of the application documents page
  # @return [Array<Hash>] hashes with 'title', 'url' and 'date_last_seen'
  #   (today's date as a string)
  def parse_documents(html)
    doc = Nokogiri::HTML(html)
    seen_on = Date.today.to_s

    doc.search("#tblContent td a").each_with_object([]) do |link, docs|
      href = link['href']
      next unless href

      # title = link.inner_text.strip.match(/^[\d\w]+?_\s*(.+?)\.pdf/)[1].gsub('_', ' ')
      docs << {
        'title' => link.inner_text.strip,
        # URI::encode was removed in Ruby 3.0; this performs the same escaping.
        'url' => URI::DEFAULT_PARSER.escape(SITE_URL + href),
        'date_last_seen' => seen_on
      }
    end
  end