OneEye Files · exp/collector.py

Files @ 29f28718a69b

Branch filter:

Location: OneEye/exp/collector.py - annotation

29f28718a69b 1.2 KiB text/x-python Show Source Show as Raw Download as Raw

Laman

transitional data processing

a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
1d1fd2ae49bf
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
a1e2cc709d53
1d1fd2ae49bf

import os
import sys
import random
import time
import urllib.error
import urllib.request

from lxml import html


# Usage: collector.py TARGET_DIR LINKS_FILE
#
# Reads page URLs from LINKS_FILE (one per line; lines starting with "#" and
# blank lines are skipped). For each page it extracts the license text and the
# og:image URL, saves the image into TARGET_DIR and prints one tab-separated
# record "link<TAB>filename<TAB>license" to stdout. Failures are reported on
# stderr and the offending link is skipped, so one bad link does not abort
# the rest of the batch.
targetDir=sys.argv[1]
linksFile=sys.argv[2]
with open(linksFile) as f:
	links=f.readlines()

for link in links:
	link=link.strip()
	# An empty line would make urlopen() raise ValueError (unknown url type).
	if not link or link.startswith("#"): continue
	# Random pause of 1-3 seconds before every request
	# (presumably to avoid hammering the server).
	time.sleep(1+2*random.random())
	try:
		with urllib.request.urlopen(link) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			doc_str=req.read().decode("utf8")
	# OSError covers urllib.error.URLError/HTTPError and socket-level failures;
	# ValueError covers malformed URLs and UnicodeDecodeError from decode().
	except (OSError,ValueError):
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	doc=html.fromstring(doc_str)

	try:
		license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
		imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0]
	except IndexError:
		# One of the expected elements is missing from the page.
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue
	# Last path segment of the image URL, without the query string.
	filename=imgUrl.split("/")[-1].split("?")[0]
	if not filename:
		# The URL ends with "/": joining an empty name would point at targetDir itself.
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue

	# Fetch the whole image before touching the disk, so that a failed download
	# neither aborts the script nor leaves an empty/truncated file behind.
	try:
		with urllib.request.urlopen(imgUrl) as req:
			if req.getcode()!=200:
				print("error when downloading {0}".format(link),file=sys.stderr)
				continue
			imgData=req.read()
	except (OSError,ValueError):
		print("error when downloading {0}".format(link),file=sys.stderr)
		continue

	# Errors while writing the file are deliberately not caught: they indicate
	# a local problem (bad targetDir, full disk) rather than a bad link.
	with open(os.path.join(targetDir,filename),mode="wb") as f:
		f.write(imgData)

	# flush so that records appear immediately when stdout is piped to a file
	print("\t".join(map(str, [link,filename,license])), flush=True)