# HG changeset patch
# User Laman
# Date 2018-12-15 09:20:11
# Node ID a1e2cc709d537226fc23c32b7a46510e22f6e281
# Parent 9df0d1a019c5c96cf56425f4f19a9844a2e50772
# photo collector
#
# New file added by this changeset: exp/collector.py
"""Download the photos referenced by photo-page URLs given on the command line.

For every page URL the script:

1. downloads the page,
2. reads the license text from the first ``<span>`` inside
   ``<div class="photo-license-info">`` and the image address from the
   ``og:image`` ``<meta>`` tag,
3. saves the image into ``targetDir`` under the last path component of the
   image URL,
4. prints one tab-separated line ``link<TAB>filename<TAB>license`` to stdout.

Failures are reported on stderr and the script moves on to the next link.
"""

import http.client
import os
import posixpath
import random
import sys
import time
import urllib.parse
import urllib.request


# Directory the images are written to; "" means the current working directory.
targetDir = ""

# Seconds to wait for a server before giving up on a request.
REQUEST_TIMEOUT = 30


def report_error(link):
    """Write the download-failure message for *link* to stderr."""
    print("error when downloading {0}".format(link), file=sys.stderr)


def fetch(url, timeout=REQUEST_TIMEOUT):
    """Return the body of *url* as bytes, or ``None`` when it cannot be fetched.

    ``urlopen`` raises instead of returning a response for HTTP error
    statuses, unreachable hosts and malformed URLs, so those are caught here
    in addition to the explicit status check.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            if response.getcode() != 200:
                return None
            return response.read()
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and socket timeouts,
        # ValueError covers URLs without a usable scheme,
        # HTTPException covers truncated or malformed responses.
        return None


def filename_from_url(url):
    """Return the last path component of *url*, or ``None`` if it is unusable.

    Only the path part of the URL is considered, so a query string or
    fragment never ends up in the file name.
    """
    name = posixpath.basename(urllib.parse.urlsplit(url).path)
    if name in ("", ".", ".."):
        return None
    return name


def parse_photo_page(doc_str):
    """Extract ``(license_text, image_url)`` from the page markup *doc_str*.

    Returns ``None`` when either piece of information is missing.
    """
    # Third-party dependency; imported here so that the rest of the module
    # stays importable when lxml is not installed.
    from lxml import html

    doc = html.fromstring(doc_str)
    licenses = doc.xpath("//div[@class='photo-license-info']//span/text()")
    image_urls = doc.xpath("//meta[@property='og:image']/@content")
    if not licenses or not image_urls:
        return None
    return licenses[0].strip(), image_urls[0]


def collect(link):
    """Download the photo shown on page *link* into ``targetDir``.

    Returns ``(link, filename, license_text)`` on success.  On any failure an
    error is printed to stderr and ``None`` is returned.
    """
    page = fetch(link)
    if page is None:
        report_error(link)
        return None

    parsed = parse_photo_page(page.decode("utf8", errors="replace"))
    if parsed is None:
        print("error when parsing {0}".format(link), file=sys.stderr)
        return None
    license_text, img_url = parsed

    filename = filename_from_url(img_url)
    if filename is None:
        print("error when parsing {0}".format(link), file=sys.stderr)
        return None

    # Read the whole image before opening the output file so that a failed
    # download does not leave an empty or truncated file behind.
    image = fetch(img_url)
    if image is None:
        report_error(link)
        return None
    with open(os.path.join(targetDir, filename), mode="wb") as f:
        f.write(image)

    return link, filename, license_text


def main(argv=None):
    """Process every link in *argv* (default: the command-line arguments).

    Returns 0 when every link was collected and 1 otherwise.
    """
    # sys.argv[0] is the script itself, not a link.
    links = sys.argv[1:] if argv is None else argv
    failed = False
    for link in links:
        # Randomised pause before each page request.
        time.sleep(1 + random.random())
        record = collect(link)
        if record is None:
            failed = True
            continue
        print("\t".join(map(str, record)))
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())