Files
@ 5617054647db
Branch filter:
Location: OneEye/exp/collector.py - annotation
5617054647db
1.2 KiB
text/x-python
grid construction and evaluation
a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf | import os
import sys
import random
import time
import urllib.error
import urllib.request
from lxml import html
# Usage: collector.py TARGET_DIR LINKS_FILE
#
# LINKS_FILE lists one page URL per line; blank lines and lines starting with
# "#" are ignored.  For every page the script reads the license text and the
# og:image URL, saves the image into TARGET_DIR and prints one tab-separated
# record "link<TAB>filename<TAB>license" on stdout.  Failures are reported on
# stderr and the affected link is skipped, so one bad entry does not abort
# the whole run.
targetDir = sys.argv[1]
linksFile = sys.argv[2]


def _download(url):
    """Return the body of ``url`` as bytes, or ``None`` if it cannot be fetched.

    A response whose status is not 200, an HTTP or network level failure
    (``URLError``, of which ``HTTPError`` is a subclass) and a URL that urllib
    rejects as malformed (``ValueError``) are all treated as failures.
    """
    try:
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                return None
            return response.read()
    except (urllib.error.URLError, ValueError):
        return None


with open(linksFile) as linksHandle:
    links = [line.strip() for line in linksHandle]

for link in links:
    # Blank lines and "#" comment lines carry no URL to fetch.
    if not link or link.startswith("#"):
        continue

    # Random pause of 1-3 seconds before each page request
    # (presumably to avoid hammering the server).
    time.sleep(1 + 2 * random.random())

    page = _download(link)
    if page is None:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    try:
        doc = html.fromstring(page.decode("utf8"))
        license_text = doc.xpath(
            "//div[@class='photo-license-info']//span/text()"
        )[0].strip()
        imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
    except (IndexError, UnicodeDecodeError):
        # Page is not valid UTF-8 or lacks the expected license/image markup.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Last path segment of the image URL, without any query string.
    filename = imgUrl.split("/")[-1].split("?")[0]
    if not filename:
        # An image URL ending in "/" yields no usable file name.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Fetch the whole image before opening the output file, so a failed
    # download never leaves an empty or truncated file behind.
    image = _download(imgUrl)
    if image is None:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    with open(os.path.join(targetDir, filename), mode="wb") as imageFile:
        imageFile.write(image)

    # Flushed per record so progress is visible while the slow loop runs.
    print("\t".join([link, filename, str(license_text)]), flush=True)
|