From bc2b77399aa2985617ee84c94d20457bed8f212e Mon Sep 17 00:00:00 2001
From: Adrian Short
Date: Thu, 6 Sep 2012 10:30:12 -0700
Subject: [PATCH]

---
 extract-urls.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 extract-urls.py

diff --git a/extract-urls.py b/extract-urls.py
new file mode 100644
index 0000000..217d9f9
--- /dev/null
+++ b/extract-urls.py
@@ -0,0 +1,45 @@
+# Extract URLs from a web page to a CSV file
+# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
+# By Adrian Short 6 Sep 2012
+#
+# Requires Python 3 and the third-party beautifulsoup4 package.
+
+import csv
+import sys
+import urllib.request
+
+from bs4 import BeautifulSoup
+
+
+def extract_links(html):
+    """Return a [text, href] row for every <a> tag found in html."""
+    # Name the parser explicitly so results do not depend on which optional
+    # parsers (lxml, html5lib) happen to be installed.
+    soup = BeautifulSoup(html, 'html.parser')
+    # Calling the soup object is shorthand for soup.find_all('a').
+    # link.string can be None (e.g. a tag with several children); csv.writer
+    # writes None as an empty field.
+    return [[link.string, link.get('href')] for link in soup('a')]
+
+
+def main(argv):
+    """Write the links of the page at argv[1] to the CSV file argv[2]."""
+    if len(argv) < 3:
+        sys.stderr.write('usage: python extract-urls.py URL OUTPUT.csv\n')
+        return 2
+    url = argv[1]
+    out_fn = argv[2]  # output filename for CSV file
+
+    # Close the HTTP response as soon as the body has been read.
+    with urllib.request.urlopen(url) as infile:
+        html = infile.read()
+
+    # On Python 3 the csv module needs a text-mode file opened with
+    # newline=''; UTF-8 keeps non-ASCII link text from failing to encode.
+    with open(out_fn, 'w', newline='', encoding='utf-8') as outfile:
+        csv.writer(outfile).writerows(extract_links(html))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))