| @@ -0,0 +1,23 @@ | |||||
| # Extract URLs from a web page to a CSV file | |||||
| # $ python extract-urls.py http://mysite.com/mypage.html myfile.csv | |||||
| # By Adrian Short 6 Sep 2012 | |||||
| import sys | |||||
| import urllib | |||||
| import csv | |||||
| from bs4 import BeautifulSoup | |||||
# Python 3 moved urlopen() into urllib.request; fall back to the Python 2
# location so the script runs unchanged on either interpreter.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from contextlib import closing

_PY2 = sys.version_info[0] < 3


def _fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    # closing() is used because the object returned by Python 2's urlopen()
    # is not itself a context manager.
    with closing(urlopen(url)) as response:
        return response.read()


def _open_csv(path):
    """Open *path* for writing in the mode csv.writer needs on this interpreter."""
    if _PY2:
        # Python 2's csv module writes byte strings.
        return open(path, 'wb')
    # Python 3's csv module writes text and does its own newline handling.
    return open(path, 'w', newline='', encoding='utf-8')


def _cell(value):
    """Return *value* in a form csv.writer can write on this interpreter.

    On Python 2 unicode text is encoded to UTF-8 bytes, because csv.writer
    there raises UnicodeEncodeError on non-ASCII unicode.  Everything else
    (including None, which csv.writer emits as an empty field) is returned
    unchanged.
    """
    if _PY2 and isinstance(value, type(u'')):
        return value.encode('utf-8')
    return value


def main(argv):
    """Write the text and href of every <a> tag on a page to a CSV file.

    argv[1] is the URL of the page to read and argv[2] is the output CSV
    filename; any further arguments are ignored.  Exits with a usage message
    when either argument is missing.
    """
    if len(argv) < 3:
        sys.exit('usage: python extract-urls.py URL OUTPUT.csv')
    url, out_fn = argv[1], argv[2]  # out_fn: output filename for CSV file

    html = _fetch(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    with _open_csv(out_fn) as outfile:
        writer = csv.writer(outfile)
        # Calling the soup object with a tag name is shorthand for
        # soup.find_all('a').
        for link in soup('a'):
            writer.writerow([_cell(link.string), _cell(link.get('href'))])


if __name__ == '__main__':
    main(sys.argv)