# Patch for exp/collector.py.
#
# The script being patched walks a list of page links. For each link it
# fetches the page, pulls a license string and the og:image URL out of the
# HTML with xpath, downloads that image into targetDir, and prints a
# tab-separated "link, filename, license" record.
#
# What this patch changes:
#   * adds `import urllib.error`.
#   * targetDir is now sys.argv[1] (was the empty string).
#   * links are now read with readlines() from the file named by sys.argv[2]
#     (previously `links` was sys.argv itself).
#   * each link is strip()ped, and lines starting with "#" are skipped.
#   * the per-link delay changes from 1+random.random() to
#     1+2*random.random() seconds.
#   * the page fetch is wrapped in try/except urllib.error.HTTPError; on that
#     error the existing "error when downloading" message goes to stderr and
#     the loop continues with the next link.
#   * the two xpath(...)[0] lookups are wrapped in try/except IndexError; on a
#     miss "error when parsing <link>" goes to stderr and the link is skipped.
#   * the filename derived from the image URL now drops everything from the
#     first "?" onwards.
#   * the final record is printed with flush=True.
#
# NOTE(review): in this copy the whole patch sits on one physical line, so the
#   original line breaks and indentation are not visible here; the line
#   structure has to be restored before the hunks can be applied.
# NOTE(review): only lines starting with "#" are skipped. A blank line in the
#   links file becomes "" after strip() and is still passed to urlopen() --
#   consider skipping empty lines as well.
# NOTE(review): only urllib.error.HTTPError is caught around the page fetch;
#   other failures (e.g. urllib.error.URLError) are presumably still fatal --
#   confirm whether that is intended.
# NOTE(review): the image download (`urlopen(imgUrl)`) appears only as
#   unchanged context, with no try/except visible around it in these hunks.
diff --git a/exp/collector.py b/exp/collector.py --- a/exp/collector.py +++ b/exp/collector.py @@ -2,26 +2,40 @@ import os import sys import random import time +import urllib.error import urllib.request from lxml import html -targetDir="" -links=sys.argv +targetDir=sys.argv[1] +linksFile=sys.argv[2] +with open(linksFile) as f: + links=f.readlines() + for link in links: - time.sleep(1+random.random()) - with urllib.request.urlopen(link) as req: - if req.getcode()!=200: - print("error when downloading {0}".format(link),file=sys.stderr) - continue - doc_str=req.read().decode("utf8") + link=link.strip() + if link.startswith("#"): continue + time.sleep(1+2*random.random()) + try: + with urllib.request.urlopen(link) as req: + if req.getcode()!=200: + print("error when downloading {0}".format(link),file=sys.stderr) + continue + doc_str=req.read().decode("utf8") + except urllib.error.HTTPError: + print("error when downloading {0}".format(link),file=sys.stderr) + continue doc=html.fromstring(doc_str) - license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip() - imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0] - filename=imgUrl.split("/")[-1] + try: + license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip() + imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0] + except IndexError: + print("error when parsing {0}".format(link),file=sys.stderr) + continue + filename=imgUrl.split("/")[-1].split("?")[0] with urllib.request.urlopen(imgUrl) as req: if req.getcode()!=200: @@ -30,4 +44,4 @@ for link in links: with open(os.path.join(targetDir,filename),mode="wb") as f: f.write(req.read()) - print("\t".join(map(str, [link,filename,license]))) + print("\t".join(map(str, [link,filename,license])), flush=True)