OneEye Files · exp/collector.py

Files @ 1d1fd2ae49bf

Branch filter:

Location: OneEye/exp/collector.py

1d1fd2ae49bf 1.2 KiB text/x-python Show Annotation Show as Raw Download as Raw

Laman

collector improved

import os
import sys
import random
import time
import urllib.error
import urllib.request

from lxml import html


# Seconds a single network operation may stall before the request is abandoned.
_TIMEOUT = 30


def _fetch(url):
	"""Download *url* and return the response body as bytes, or None on failure.

	Failure means a non-200 status, any network-level error, or a URL that
	urllib cannot interpret. The caller decides how to report it.
	"""
	try:
		with urllib.request.urlopen(url, timeout=_TIMEOUT) as resp:
			if resp.getcode() != 200:
				return None
			return resp.read()
	# OSError covers HTTPError/URLError, timeouts and connection resets;
	# ValueError is what urlopen raises for a malformed or scheme-less URL.
	except (OSError, ValueError):
		return None


def _parse_page(page):
	"""Extract ``(license text, image URL)`` from a downloaded photo page.

	*page* is the raw response body. Returns None when the body is not valid
	UTF-8, is empty, or lacks either the license span or the og:image tag.
	"""
	try:
		docStr = page.decode("utf8")
	except UnicodeDecodeError:
		return None
	# Nothing to extract from an empty body; skip handing it to the parser.
	if not docStr.strip():
		return None

	doc = html.fromstring(docStr)
	try:
		licenseText = doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
		imgUrl = doc.xpath("//meta[@property='og:image']/@content")[0]
	except IndexError:
		return None
	return licenseText, imgUrl


def main(argv=None):
	"""Download the image behind every photo page listed in a links file.

	*argv* is ``[targetDir, linksFile]`` and defaults to ``sys.argv[1:]``;
	extra arguments are ignored. The links file holds one page URL per line;
	blank lines and lines starting with ``#`` are skipped. For each page the
	image named by its og:image meta tag is saved into targetDir, and a
	tab-separated ``link, filename, license`` record is printed to stdout.
	Per-link failures are reported on stderr and do not stop the run.

	Returns 0 after processing all links, or 2 when arguments are missing.
	"""
	if argv is None:
		argv = sys.argv[1:]
	if len(argv) < 2:
		print("usage: collector.py TARGET_DIR LINKS_FILE", file=sys.stderr)
		return 2
	targetDir, linksFile = argv[0], argv[1]

	with open(linksFile) as f:
		links = [line.strip() for line in f]

	for link in links:
		# A blank line is not a URL (urlopen would raise on it); "#" marks a comment.
		if not link or link.startswith("#"):
			continue
		# Randomised 1-3 s pause before each page request.
		time.sleep(1 + 2 * random.random())

		page = _fetch(link)
		if page is None:
			print("error when downloading {0}".format(link), file=sys.stderr)
			continue

		parsed = _parse_page(page)
		if parsed is None:
			print("error when parsing {0}".format(link), file=sys.stderr)
			continue
		licenseText, imgUrl = parsed

		# Last path segment of the image URL, with any query string removed.
		filename = imgUrl.split("/")[-1].split("?")[0]
		# These would make the output path point at a directory, not a file.
		if filename in ("", ".", ".."):
			print("error when parsing {0}".format(link), file=sys.stderr)
			continue

		# Read the whole image before touching the disk so a failed download
		# never leaves an empty or truncated file behind.
		image = _fetch(imgUrl)
		if image is None:
			print("error when downloading {0}".format(link), file=sys.stderr)
			continue
		with open(os.path.join(targetDir, filename), mode="wb") as f:
			f.write(image)

		# Flushed per record so progress is visible when stdout is piped.
		print("\t".join((link, filename, licenseText)), flush=True)

	return 0


if __name__ == "__main__":
	sys.exit(main())