# Provenance: added as new file extract-urls.py (mode 100644, blob 217d9f9)
# in commit bc2b77399aa2985617ee84c94d20457bed8f212e
# Author: Adrian Short
# Date:   Thu Sep 6 10:30:12 2012 -0700
#
# Extract URLs from a web page to a CSV file
# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
# By Adrian Short 6 Sep 2012
#
# Each output row is: link text, href.  Runs on Python 2 and Python 3.

import contextlib
import csv
import sys

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Reported with a readable message in main() instead of a traceback.
    BeautifulSoup = None

PY2 = sys.version_info[0] == 2

USAGE = "usage: python extract-urls.py URL OUTPUT.csv\n"


def _fetch(url):
    """Return the raw response body of *url*, closing the connection after."""
    # closing() is used because Python 2's urlopen() result is not a
    # context manager on its own.
    with contextlib.closing(urlopen(url)) as response:
        return response.read()


def _open_csv(path):
    """Open *path* for writing in the mode the csv module expects."""
    if PY2:
        # Python 2's csv module writes byte strings.
        return open(path, 'wb')
    # Python 3's csv module needs text mode with newline translation off.
    return open(path, 'w', newline='', encoding='utf-8')


def _cell(value):
    """Return *value* in a form csv.writer can write without encoding errors.

    On Python 2 unicode text is encoded to UTF-8 (csv would otherwise try
    ASCII and fail on non-ASCII links).  None is passed through unchanged;
    csv.writer emits it as an empty field.
    """
    if PY2 and value is not None and not isinstance(value, str):
        return value.encode('utf-8')
    return value


def main(argv=None):
    """Write the text and href of every <a> on a page to a CSV file.

    argv -- argument list shaped like sys.argv (program name, URL, output
            filename); defaults to sys.argv.  It is not modified.

    Returns a process exit status: 0 on success, 2 when the URL or output
    filename is missing, 1 when Beautiful Soup is not installed.  Errors
    from fetching the URL or writing the file propagate as exceptions.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        sys.stderr.write(USAGE)
        return 2

    if BeautifulSoup is None:
        sys.stderr.write(
            "extract-urls.py requires Beautiful Soup 4 "
            "(pip install beautifulsoup4)\n")
        return 1

    url = argv[1]
    out_fn = argv[2]  # output filename for CSV file

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(_fetch(url), 'html.parser')

    with _open_csv(out_fn) as outfile:
        writer = csv.writer(outfile)

        # Calling the soup object is a shortcut for soup.find_all('a').
        for link in soup('a'):
            # link.string is None when the anchor has more than one child
            # node, and link.get('href') is None when there is no href;
            # both end up as empty fields.
            writer.writerow([_cell(link.string), _cell(link.get('href'))])

    return 0


if __name__ == "__main__":
    sys.exit(main())