From bc2b77399aa2985617ee84c94d20457bed8f212e Mon Sep 17 00:00:00 2001
From: Adrian Short <adrian@adrianshort.co.uk>
Date: Thu, 6 Sep 2012 10:30:12 -0700
Subject: [PATCH]

---
 extract-urls.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 extract-urls.py

diff --git a/extract-urls.py b/extract-urls.py
new file mode 100644
index 0000000..217d9f9
--- /dev/null
+++ b/extract-urls.py
@@ -0,0 +1,43 @@
+# Extract URLs from a web page to a CSV file
+# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
+# By Adrian Short 6 Sep 2012
+#
+# Writes one CSV row per <a> element: the link text, then the href value.
+# Written for Python 2 (urllib.urlopen, CSV file opened in binary mode).
+
+import sys
+import urllib
+import csv
+from contextlib import closing
+from bs4 import BeautifulSoup
+
+
+def encode_cell(value):
+    """Return value as UTF-8 bytes, passing None through unchanged."""
+    # Python 2's csv module cannot write unicode objects containing
+    # non-ASCII characters, so encode explicitly. csv writes None as ''.
+    if value is None:
+        return None
+    return value.encode('utf-8')
+
+
+# Fail with a usage message instead of an IndexError traceback
+if len(sys.argv) < 3:
+    sys.exit('usage: python extract-urls.py URL OUTPUT_CSV')
+
+url = sys.argv[1]
+out_fn = sys.argv[2] # output filename for CSV file
+
+# closing() releases the HTTP connection even if read() raises
+with closing(urllib.urlopen(url)) as infile:
+    html = infile.read()
+soup = BeautifulSoup(html)
+
+with open(out_fn, 'wb') as outfile:
+    writer = csv.writer(outfile)
+
+    # Calling the soup object is shorthand for soup.find_all()
+    for link in soup('a'):
+        # link.string is None unless the <a> wraps a single string
+        writer.writerow([encode_cell(link.string),
+                         encode_cell(link.get('href'))])