"""Download the image referenced by each photo page URL given on the command line.

For every page URL the script extracts the license text and the ``og:image``
URL, saves the image into ``targetDir`` and prints one tab-separated line to
stdout: page URL, saved file name, license text.  Pages that cannot be
downloaded or parsed are reported on stderr and skipped.
"""
import os
import random
import sys
import time
import urllib.error
import urllib.request

from lxml import html

# Directory the images are written to; "" makes os.path.join return the bare
# file name, i.e. the file lands in the current working directory.
targetDir = ""


def _fetch(url):
    """Return the response body of *url* as bytes, or None if the request failed."""
    try:
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                return None
            return response.read()
    except (urllib.error.URLError, ValueError):
        # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx responses,
        # URLError for connection problems and ValueError for malformed URLs.
        # Treat all of them as "this link failed" so one bad link does not
        # abort the remaining downloads.
        return None


def main(links):
    """Process every page URL in *links*.

    Args:
        links: iterable of photo page URLs.
    """
    for link in links:
        # Random 1-2 second pause before every request.
        time.sleep(1 + random.random())

        page = _fetch(link)
        if page is None:
            print("error when downloading {0}".format(link), file=sys.stderr)
            continue

        doc = html.fromstring(page.decode("utf8"))
        license_nodes = doc.xpath("//div[@class='photo-license-info']//span/text()")
        image_urls = doc.xpath("//meta[@property='og:image']/@content")
        if not license_nodes or not image_urls:
            # The page lacks the expected markup; skip it instead of
            # crashing with an IndexError.
            print("error when parsing {0}".format(link), file=sys.stderr)
            continue

        license_text = license_nodes[0].strip()
        image_url = image_urls[0]
        filename = image_url.split("/")[-1]

        image = _fetch(image_url)
        if image is None:
            print("error when downloading {0}".format(link), file=sys.stderr)
            continue

        with open(os.path.join(targetDir, filename), mode="wb") as f:
            f.write(image)
        print("\t".join(map(str, [link, filename, license_text])))


if __name__ == "__main__":
    # sys.argv[0] is the script path itself, not a link.
    main(sys.argv[1:])