import os
import sys
import random
import time
import urllib.error
import urllib.request

from lxml import html

# Usage: python <this script> TARGET_DIR LINKS_FILE
# Reads photo-page URLs from LINKS_FILE (one per line, "#" lines are comments),
# extracts each page's image URL and license text, saves the image into
# TARGET_DIR, and prints one tab-separated record per image to stdout.
targetDir = sys.argv[1]
linksFile = sys.argv[2]

with open(linksFile) as f:
    links = f.readlines()

for link in links:
    link = link.strip()
    if link.startswith("#"):
        continue

    # Be polite to the server: wait between 1 and 3 seconds per request.
    time.sleep(1 + 2 * random.random())

    # Fetch the photo page itself.
    try:
        with urllib.request.urlopen(link) as req:
            if req.getcode() != 200:
                print("error when downloading {0}".format(link), file=sys.stderr)
                continue
            doc_str = req.read().decode("utf8")
    except urllib.error.HTTPError:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    # Parse the page: pull the license text and the og:image URL.
    doc = html.fromstring(doc_str)
    try:
        license = doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
        imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
    except IndexError:
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Derive a local filename from the image URL (strip any query string).
    filename = imgUrl.split("/")[-1].split("?")[0]

    # Download the image and write it into the target directory.
    with urllib.request.urlopen(imgUrl) as req:
        if req.getcode() != 200:
            print("error when downloading {0}".format(link), file=sys.stderr)
            continue
        with open(os.path.join(targetDir, filename), mode="wb") as f:
            f.write(req.read())

    # Record the mapping: source link, saved filename, license text.
    print("\t".join(map(str, [link, filename, license])), flush=True)