import os
import sys
import random
import time
import urllib.error
import urllib.request
from lxml import html
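
# This script downloads images for a list of photo-page links.
# Each line of linksFile is a page URL; lines starting with "#" are ignored.
# For every page it extracts the license text and the og:image URL, saves the
# image into targetDir, and prints a tab-separated record (link, file name,
# license) to stdout.
# Command-line arguments: sys.argv[1] = target directory, sys.argv[2] = links file.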
targetDir = sys.argv[1]
linksFile = sys.argv[2]

with open(linksFile) as f:
    links = f.readlines()

for link in links:
    link = link.strip()
    # Skip blank lines and comment lines.
    if not link or link.startswith("#"):
        continue
    # Be polite: wait 1-3 seconds between requests.
    time.sleep(1 + 2 * random.random())
    # Fetch the photo page.
    try:
        with urllib.request.urlopen(link) as req:
            if req.getcode() != 200:
                print("error when downloading {0}".format(link), file=sys.stderr)
                continue
            doc_str = req.read().decode("utf8")
    except urllib.error.HTTPError:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue
    # Extract the license text and the image URL from the page.
    doc = html.fromstring(doc_str)
    try:
        license = doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
        imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
    except IndexError:
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue
    # Derive the local file name from the image URL (drop any query string).
    filename = imgUrl.split("/")[-1].split("?")[0]
    # Download the image and write it into the target directory.
    try:
        with urllib.request.urlopen(imgUrl) as req:
            if req.getcode() != 200:
                print("error when downloading {0}".format(link), file=sys.stderr)
                continue
            with open(os.path.join(targetDir, filename), mode="wb") as f:
                f.write(req.read())
    except urllib.error.HTTPError:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue
    # Record the result: link, local file name, license text (tab-separated).
    print("\t".join(map(str, [link, filename, license])), flush=True)