Changeset - 1d1fd2ae49bf
[Not reviewed]
default
0 1 0
Laman - 6 years ago 2018-12-20 22:03:01

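collector improved: read links from a file, skip links commented out with "#", handle download (HTTPError) and parse (IndexError) failures, strip the query string from saved filenames, and flush output after each record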
1 file changed with 26 insertions and 12 deletions:
0 comments (0 inline, 0 general)
exp/collector.py
Show inline comments
 
@@ -2,26 +2,40 @@ import os
 
import sys
 
import random
 
import time
 
import urllib.error
 
import urllib.request
 

	
 
from lxml import html
 

	
 

	
 
targetDir=""
 
links=sys.argv
 
targetDir=sys.argv[1]
 
linksFile=sys.argv[2]
 
with open(linksFile) as f:
 
	links=f.readlines()
 

	
 
for link in links:
 
	time.sleep(1+random.random())
 
	with urllib.request.urlopen(link) as req:
 
		if req.getcode()!=200:
 
			print("error when downloading {0}".format(link),file=sys.stderr)
 
			continue
 
		doc_str=req.read().decode("utf8")
 
	link=link.strip()
 
	if link.startswith("#"): continue
 
	time.sleep(1+2*random.random())
 
	try:
 
		with urllib.request.urlopen(link) as req:
 
			if req.getcode()!=200:
 
				print("error when downloading {0}".format(link),file=sys.stderr)
 
				continue
 
			doc_str=req.read().decode("utf8")
 
	except urllib.error.HTTPError:
 
		print("error when downloading {0}".format(link),file=sys.stderr)
 
		continue
 

	
 
	doc=html.fromstring(doc_str)
 

	
 
	license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
 
	imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0]
 
	filename=imgUrl.split("/")[-1]
 
	try:
 
		license=doc.xpath("//div[@class='photo-license-info']//span/text()")[0].strip()
 
		imgUrl=doc.xpath("//meta[@property='og:image']/@content")[0]
 
	except IndexError:
 
		print("error when parsing {0}".format(link),file=sys.stderr)
 
		continue
 
	filename=imgUrl.split("/")[-1].split("?")[0]
 

	
 
	with urllib.request.urlopen(imgUrl) as req:
 
		if req.getcode()!=200:
 
@@ -30,4 +44,4 @@ for link in links:
 
		with open(os.path.join(targetDir,filename),mode="wb") as f:
 
			f.write(req.read())
 

	
 
	print("\t".join(map(str, [link,filename,license])))
 
	print("\t".join(map(str, [link,filename,license])), flush=True)
0 comments (0 inline, 0 general)