OneEye Files · exp/collector.py

Files @ f1f8a2421f92

Branch filter:

Location: OneEye/exp/collector.py - annotation

f1f8a2421f92 1.2 KiB text/x-python Show Source Show as Raw Download as Raw

Laman

updated readme

a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf

import os
import sys
import random
import time
import urllib.error
import urllib.request

from lxml import html


# Command line: <script> TARGET_DIR LINKS_FILE
#   TARGET_DIR - directory the downloaded images are written into
#   LINKS_FILE - text file with one page URL per line
# Fail with a usage message instead of a bare IndexError traceback when an argument is missing.
if len(sys.argv)<3:
	print("usage: {0} TARGET_DIR LINKS_FILE".format(sys.argv[0]),file=sys.stderr)
	sys.exit(2)

targetDir=sys.argv[1]
linksFile=sys.argv[2]
with open(linksFile) as f:
	links=f.readlines()

# Seconds of socket inactivity tolerated per request, so a stalled server cannot hang the whole run.
REQUEST_TIMEOUT=30

# For every link: fetch the page, scrape the license text and the og:image URL,
# save the image into targetDir and print a tab-separated record
# (link, filename, license) on stdout. Any failure is reported on stderr
# and the link is skipped, so one bad entry never aborts the whole run.
for link in links:
	link=link.strip()
	# skip blank lines (urlopen("") would raise ValueError) and commented-out links
	if not link or link.startswith("#"): continue
	# randomized pause of 1 to 3 seconds between requests
	time.sleep(1+2*random.random())
	try:
		with urllib.request.urlopen(link,timeout=REQUEST_TIMEOUT) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			doc_str=req.read().decode("utf8")
	# OSError covers URLError (and its subclass HTTPError) as well as socket errors and timeouts
	# raised while reading the body; ValueError covers malformed URLs and undecodable (non-UTF-8) pages.
	except (OSError,ValueError):
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	doc=html.fromstring(doc_str)

	try:
		# named licenseText so the `license` builtin installed by `site` is not shadowed
		licenseText=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
		imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0]
	except IndexError:
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue
	# last path segment of the image URL, without the query string
	filename=imgUrl.split("/")[-1].split("?")[0]
	if not filename:
		# an empty name would make open() below target the directory itself
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue

	# Only the network part is guarded; a failure to write the file (permissions, full disk)
	# is a local problem and is left to propagate.
	try:
		with urllib.request.urlopen(imgUrl,timeout=REQUEST_TIMEOUT) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			imgData=req.read()
	except (OSError,ValueError):
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	with open(os.path.join(targetDir,filename),mode="wb") as f:
		f.write(imgData)

	# flush so the record is visible immediately even when stdout is redirected to a file
	print("\t".join(map(str, [link,filename,licenseText])), flush=True)