# Extract URLs from a web page to a CSV file
# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
# By Adrian Short 6 Sep 2012

import csv
import sys
import urllib

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

from bs4 import BeautifulSoup

def fetch_html(url):
    """Download *url* and return the raw response body.

    The response is closed even when reading fails, instead of being left
    open until the interpreter exits.
    """
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def extract_links(html):
    """Return a list of ``(text, href)`` pairs, one per ``<a>`` tag in *html*.

    ``text`` is the tag's ``.string``, which bs4 reports as ``None`` when the
    anchor cannot be reduced to a single string; ``href`` is ``None`` when
    the attribute is absent.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(html, 'html.parser')

    # Calling the soup object is shorthand for soup.find_all('a'); the
    # argument is a tag name, not a CSS selector.
    return [(link.string, link.get('href')) for link in soup('a')]


def _csv_cell(value):
    """Return *value* in a form the running interpreter's csv module accepts."""
    # On Python 3 every value is None or text, so this is a no-op. On
    # Python 2 the csv module only handles byte strings while bs4 yields
    # unicode, so encode instead of failing on non-ASCII link text.
    if value is None or isinstance(value, str):
        return value
    return value.encode('utf-8')


def write_links(rows, out_fn):
    """Write ``(text, href)`` pairs to the CSV file *out_fn*, one per row.

    ``None`` values are written as empty cells. The file is UTF-8 encoded.
    """
    if sys.version_info[0] >= 3:
        # csv on Python 3 needs a text-mode file with newline translation off.
        outfile = open(out_fn, 'w', newline='', encoding='utf-8')
    else:
        # csv on Python 2 needs a binary-mode file.
        outfile = open(out_fn, 'wb')

    with outfile:
        writer = csv.writer(outfile)
        for text, href in rows:
            writer.writerow([_csv_cell(text), _csv_cell(href)])


def main(argv=None):
    """Run the script: fetch ``argv[1]`` and write its links to ``argv[2]``.

    Returns a process exit status: 0 on success, 2 when the URL or the
    output filename is missing. Extra arguments are ignored, as before.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else 'extract-urls.py'
        sys.stderr.write('usage: %s URL OUTPUT.csv\n' % prog)
        return 2

    url = argv[1]
    out_fn = argv[2]  # output filename for CSV file

    # Fetch and parse before opening the output file so a failed download
    # does not leave behind a truncated CSV.
    write_links(extract_links(fetch_html(url)), out_fn)
    return 0


if __name__ == '__main__':
    sys.exit(main())