Changeset - a1e2cc709d53
[Not reviewed]
default
0 0 1
Laman - 6 years ago 2018-12-15 09:20:11

photo collector
1 file changed with 33 insertions and 0 deletions:
0 comments (0 inline, 0 general)
exp/collector.py
Show inline comments
 
new file 100644
 
import os
 
import sys
 
import random
 
import time
 
import urllib.request
 

	
 
from lxml import html
 

	
 

	
 
# Photo collector: for every page URL given on the command line, fetch the
# page, read the licence text and the og:image URL from it, save the image
# into targetDir and print one tab-separated record per photo
# (page URL, saved file name, licence text) to stdout.
# Problems are reported on stderr and the offending link is skipped.

# Directory the images are written to; with an empty string os.path.join
# returns the bare file name, i.e. the current working directory is used.
targetDir=""
# sys.argv[0] is the script's own path, not a link, so it must be skipped;
# passing it to urlopen would fail before any real link is processed.
links=sys.argv[1:]
for link in links:
	# Pause for a random 1-2 seconds before each page request
	# (presumably to go easy on the server).
	time.sleep(1+random.random())
	with urllib.request.urlopen(link) as req:
		if req.getcode()!=200:
			print("error when downloading {0}".format(link),file=sys.stderr)
			continue
		doc_str=req.read().decode("utf8")

	doc=html.fromstring(doc_str)

	license_spans=doc.xpath("//div[@class='photo-license-info']//span/text()")
	image_urls=doc.xpath("//meta[@property='og:image']/@content")
	# A page without the expected elements is skipped instead of aborting
	# the whole run with an IndexError.
	if not license_spans or not image_urls:
		print("error when parsing {0}".format(link),file=sys.stderr)
		continue
	license_text=license_spans[0].strip()
	imgUrl=image_urls[0]
	# The last path segment of the image URL becomes the local file name.
	filename=imgUrl.split("/")[-1]

	with urllib.request.urlopen(imgUrl) as req:
		if req.getcode()!=200:
			# Report the image URL, since that is the request that failed.
			print("error when downloading {0}".format(imgUrl),file=sys.stderr)
			continue
		with open(os.path.join(targetDir,filename),mode="wb") as f:
			f.write(req.read())

	print(link,filename,license_text,sep="\t")
0 comments (0 inline, 0 general)