Files
@ 6180b3bd7f3f
Branch filter:
Location: OneEye/exp/collector.py - annotation
6180b3bd7f3f
1.2 KiB
text/x-python
generating synthetic samples
a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf 1d1fd2ae49bf a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 a1e2cc709d53 1d1fd2ae49bf | import os
import sys
import random
import time
import urllib.error
import urllib.request
from lxml import html
# Usage: collector.py TARGET_DIR LINKS_FILE
#
# Reads page URLs from LINKS_FILE (one per line; blank lines and lines
# starting with "#" are skipped). For every page it extracts the license text
# and the og:image URL, saves the image into TARGET_DIR and prints a
# tab-separated "link, filename, license" record on stdout. Failures are
# reported on stderr and the script moves on to the next link.


def _fetch(url):
    """Return the body of *url* as bytes, or None when the download fails.

    A failure is a non-200 status, any urllib.error.URLError (which includes
    HTTPError, e.g. 404, as well as DNS/connection errors), or a ValueError
    raised by urlopen for a malformed URL.
    """
    try:
        with urllib.request.urlopen(url) as resp:
            if resp.getcode() != 200:
                return None
            return resp.read()
    except (urllib.error.URLError, ValueError):
        return None


targetDir = sys.argv[1]
linksFile = sys.argv[2]

with open(linksFile) as f:
    links = f.readlines()

for link in links:
    link = link.strip()
    # Blank lines would otherwise reach urlopen("") and abort the whole run.
    if not link or link.startswith("#"):
        continue

    # Randomised 1-3 second pause between requests.
    time.sleep(1 + 2 * random.random())

    page = _fetch(link)
    if page is None:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    try:
        doc_str = page.decode("utf8")
    except UnicodeDecodeError:
        # A page that is not valid UTF-8 cannot be parsed as expected.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    doc = html.fromstring(doc_str)
    try:
        licenseText = doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
        imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
    except IndexError:
        # One of the expected elements is missing from the page.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Last path component of the image URL, without any query string.
    filename = imgUrl.split("/")[-1].split("?")[0]
    if not filename:
        # An image URL ending in "/" gives no usable file name; writing would
        # try to open the target directory itself.
        print("error when parsing {0}".format(link), file=sys.stderr)
        continue

    # Download the whole image before touching the disk so a failed transfer
    # does not leave an empty or truncated file behind.
    imgData = _fetch(imgUrl)
    if imgData is None:
        print("error when downloading {0}".format(link), file=sys.stderr)
        continue

    with open(os.path.join(targetDir, filename), mode="wb") as out:
        out.write(imgData)

    print("\t".join(map(str, [link, filename, licenseText])), flush=True)
|