|
|
@@ -0,0 +1,23 @@ |
|
|
|
# Extract URLs from a web page to a CSV file |
|
|
|
# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv |
|
|
|
# By Adrian Short 6 Sep 2012 |
|
|
|
|
|
|
|
import sys |
|
|
|
import urllib |
|
|
|
import csv |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2, which this script originally targeted

# The csv module wants bytes on Python 2 and text on Python 3, so a few
# I/O details below have to branch on the interpreter version.
_PY2 = sys.version_info[0] < 3


def _fetch(url):
    """Download *url* and return the raw response body."""
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Python 2 response objects are not context managers, so the handle
        # is closed by hand instead of with a `with` statement.
        response.close()


def _open_csv(path):
    """Open *path* for writing in the mode csv.writer needs on this Python."""
    if _PY2:
        return open(path, 'wb')
    # Python 3: csv needs a text-mode file opened with newline='' so the
    # writer's own line terminators are not translated a second time.
    return open(path, 'w', newline='', encoding='utf-8')


def _cell(value):
    """Return *value* in a form csv.writer can emit on this Python."""
    if _PY2 and value is not None:
        # Python 2's csv module cannot write non-ASCII unicode; encode first.
        return value.encode('utf-8')
    return value


def main(argv=None):
    """Write the text and href of every <a> tag on a page to a CSV file.

    Args:
        argv: Argument list laid out like sys.argv (program name, URL,
            output CSV filename). Defaults to sys.argv. Arguments after
            the filename are ignored.

    Returns:
        0 on success, 2 if the URL or the output filename is missing.
    """
    args = list(sys.argv if argv is None else argv)
    if len(args) < 3:
        prog = args[0] if args else 'extract-urls.py'
        sys.stderr.write('usage: %s URL OUTPUT_CSV\n' % prog)
        return 2
    url, out_fn = args[1], args[2]  # out_fn: output filename for the CSV file

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(_fetch(url), 'html.parser')

    with _open_csv(out_fn) as outfile:
        writer = csv.writer(outfile)

        # Calling the soup object is shorthand for soup.find_all('a').
        for link in soup('a'):
            # link.string is None when the anchor's content is not a single
            # string; csv.writer emits None as an empty field.
            writer.writerow([_cell(link.string), _cell(link.get('href'))])
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
|
|
|