Files
@ f1f8a2421f92
Branch filter:
Location: OneEye/exp/collector.py - annotation
f1f8a2421f92
1.2 KiB
text/x-python
updated readme
a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf | import os
import sys
import random
import time
import urllib.error
import urllib.request
from lxml import html
# Usage: collector.py TARGET_DIR LINKS_FILE
#
# Reads page URLs from LINKS_FILE (one per line; blank lines and lines
# starting with "#" are skipped). For each page it extracts the license text
# and the og:image URL, saves the image into TARGET_DIR, and prints
# "link<TAB>filename<TAB>license" to stdout. Any failure for a single link is
# reported on stderr and that link is skipped, so one bad link never aborts
# the whole run.
targetDir = sys.argv[1]
linksFile = sys.argv[2]

with open(linksFile) as f:
    links = f.readlines()

for link in links:
    link = link.strip()
    # An empty string would make urlopen() raise ValueError, so blank lines
    # are skipped the same way as comment lines (and before sleeping).
    if not link or link.startswith("#"):
        continue

    # Random pause of 1-3 seconds between requests (presumably to avoid
    # hammering the remote server).
    time.sleep(1 + 2 * random.random())

    # --- fetch the page ---------------------------------------------------
    try:
        with urllib.request.urlopen(link) as req:
            if req.getcode() != 200:
                print("error when downloading {0}".format(link), file=sys.stderr)
                continue
            page_bytes = req.read()
    except (OSError, ValueError):
        # OSError covers urllib.error.URLError and its subclass HTTPError as
        # well as socket-level failures; ValueError is raised by urlopen()
        # for a malformed URL.
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    # --- extract license text and image URL -------------------------------
    try:
        doc = html.fromstring(page_bytes.decode("utf8"))
        license_text = doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
        imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
    except (IndexError, UnicodeDecodeError):
        # IndexError: one of the XPath queries matched nothing.
        # UnicodeDecodeError: the page is not valid UTF-8.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Last path component of the image URL, with any query string removed.
    filename = imgUrl.split("/")[-1].split("?")[0]
    if not filename:
        # A URL ending in "/" yields no usable file name; writing to
        # os.path.join(targetDir, "") would target the directory itself.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # --- fetch the image --------------------------------------------------
    # Guarded like the page download so a failing image is skipped instead
    # of terminating the script.
    try:
        with urllib.request.urlopen(imgUrl) as req:
            if req.getcode() != 200:
                print("error when downloading {0}".format(link), file=sys.stderr)
                continue
            image_bytes = req.read()
    except (OSError, ValueError):
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    # Written outside the try block on purpose: a local write failure (e.g.
    # missing target directory) should surface rather than be reported as a
    # download error.
    with open(os.path.join(targetDir, filename), mode="wb") as f:
        f.write(image_bytes)

    # Flushed per record so progress is visible when stdout is piped.
    print("\t".join(map(str, [link, filename, license_text])), flush=True)
|