Extract URLs from a web page
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

23 lines
595 B

  1. # Extract URLs from a web page to a CSV file
  2. # $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
  3. # By Adrian Short 6 Sep 2012
  4. import sys
  5. import urllib
  6. import csv
  7. from bs4 import BeautifulSoup
  8. url = sys.argv.pop(1)
  9. out_fn = sys.argv.pop(1) # output filename for CSV file
  10. infile = urllib.urlopen(url)
  11. html = infile.read()
  12. soup = BeautifulSoup(html)
  13. with open(out_fn, 'wb') as outfile:
  14. writer = csv.writer(outfile)
  15. # You can use a CSS selector as an alias for find_all()
  16. for link in soup('a'):
  17. writer.writerow([link.string, link.get('href')])