OneEye Files · exp/collector.py

Files @ 28c6f89a3a7e

Branch filter:

Location: OneEye/exp/collector.py - annotation

28c6f89a3a7e 1.2 KiB text/x-python Show Source Show as Raw Download as Raw

Laman

stone detection enhanced by edge subtraction

a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf

import os
import sys
import random
import time
import urllib.error
import urllib.request

from lxml import html


# Usage: collector.py TARGET_DIR LINKS_FILE
#
# Reads page URLs from LINKS_FILE (one per line; blank lines and lines
# starting with "#" are skipped). For every page it extracts the license text
# and the og:image URL, stores the image in TARGET_DIR and prints a
# tab-separated record "link, filename, license" to stdout. Failures of a
# single link are reported on stderr and the script moves on to the next link.
targetDir=sys.argv[1]
linksFile=sys.argv[2]
with open(linksFile) as f:
	links=f.readlines()

for link in links:
	link=link.strip()
	# An empty line would make urlopen() raise ValueError (unknown url type),
	# so it is skipped together with the comment lines.
	if not link or link.startswith("#"): continue
	# Random pause of 1-3 seconds before each request (presumably to throttle
	# the load on the remote server).
	time.sleep(1+2*random.random())
	try:
		with urllib.request.urlopen(link) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			doc_str=req.read().decode("utf8")
	except (OSError,ValueError):
		# OSError covers urllib.error.URLError / HTTPError and socket-level
		# failures; ValueError covers malformed URLs and UnicodeDecodeError.
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	doc=html.fromstring(doc_str)

	try:
		license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
		imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0]
	except IndexError:
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue
	# Last path segment of the image URL, with any query string removed.
	filename=imgUrl.split("/")[-1].split("?")[0]
	if not filename:
		# An image URL ending in "/" yields no usable file name.
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue

	# Fetch the whole image into memory first, so that a failed transfer does
	# not leave an empty or truncated file in targetDir.
	try:
		with urllib.request.urlopen(imgUrl) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			imgData=req.read()
	except (OSError,ValueError):
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	# Errors while writing (e.g. a missing targetDir) are intentionally not
	# caught: they indicate a local problem that would affect every link.
	with open(os.path.join(targetDir,filename),mode="wb") as f:
		f.write(imgData)

	print("\t".join(map(str, [link,filename,license])), flush=True)