Location: Morevna/work.py

import hashlib
import sys
import datetime
import sqlite3
import math
import pathlib

BLOCK_SIZE=4096
HASH_LEN=8
DB="morevna.db"
FILES_PER_DIR=100
FILE_COUNT=2**21
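# with these defaults FILE_COUNT=2**21 corresponds to an 8 GiB image split
# into 4 KiB sectors (8*2**30/4096); HASH_LEN truncates each SHA-256 digest
# to its first 8 bytes, and FILES_PER_DIR bounds the fan-out of the
# directory tree built by getPath below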

def initDB():
	conn=sqlite3.connect(DB)

	c=conn.cursor()
	c.execute("""create table if not exists `hashes` (
		`sector_id` integer primary key,
		`sector_hash` blob not null,
		`dirty` integer check(`dirty` in (0,1))
	)""")
	
	c.execute("""create index if not exists `dirty_index` on `hashes` (`dirty`)""")
	conn.commit()
	conn.close()

def initHashes(fileName, hashFile):
	with open(hashFile, mode="bw") as f:
		for chunkHash in chunks(fileName):
			f.write(chunkHash)

def chunks(fileName):
	with open(fileName, mode="br") as f:
		data=f.read(BLOCK_SIZE)
		while data:			
			yield hashlib.sha256(data).digest()[:HASH_LEN]
			data=f.read(BLOCK_SIZE)
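
# sanity check (hypothetical file smaller than BLOCK_SIZE): a file containing
# just b"abc" yields a single truncated digest,
#   hashlib.sha256(b"abc").digest()[:HASH_LEN].hex() == "ba7816bf8f01cfea"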

def hashes(fileName):
	# legacy file-based variant, superseded by the DB query below
#	with open(fileName, mode="br") as f:
#		hashBytes=f.read(HASH_LEN)
#		while hashBytes:
#			yield hashBytes
#			hashBytes=f.read(HASH_LEN)
	with sqlite3.connect(DB) as db:
		handle=db.cursor()
		handle.execute("""select `sector_hash` from `hashes` order by `sector_id` asc""")
		row=handle.fetchone()
		while row is not None:
			yield row[0] # rows are one-element tuples; unwrap to the raw hash bytes
			row=handle.fetchone()

def compare(fileName, hashFile):
	changelist=[]

	# build changelist
	# can use up to (HASH_LEN+size(int))*(filesize/BLOCK_SIZE) bytes of memory plus overhead
	# that's (8+4)*(8*2**30 / 4096) = 24MB for defaults
	for (i,(dataHash,savedHash)) in enumerate(zip(chunks(fileName),hashes(hashFile))):
		if dataHash!=savedHash:
			changelist.append((i,dataHash))

	# write log
	with open(logFile, mode="w") as f:
		f.write("sector	hash\n")
		for (i,dataHash) in changelist:
			f.write("{0}\t{1}\n".format(i,dataHash))
			
	# update DB
	with sqlite3.connect(DB) as db:
		handle=db.cursor()
		handle.executemany("""update `hashes` set `sector_hash`=:hash, `dirty`=1 where `sector_id`=:id""",
			({"id":i,"hash":dataHash} for (i,dataHash) in changelist))
		db.commit()
			
	# update hashFile
#	with open(hashFile, mode="r+b") as f:
#		for (i,dataHash) in changelist:
#			f.seek(i*HASH_LEN)
#			f.write(dataHash)
	
def transferChanges():
	# read changes
	with sqlite3.connect(DB) as db, open(fileName,mode="rb") as sf:
		handle=db.cursor()
		handle.execute("""select `sector_id` from `hashes` where `dirty`=1""")

		# transfer modified sectors and mark them as clean
		sectorIds=handle.fetchall()
		for (sectorId,) in sectorIds: # fetchall returns one-element tuples
			path=getPath(sectorId)
			path.parent.mkdir(parents=True,exist_ok=True)
			with path.open(mode="wb") as df: # each sector becomes a standalone file
				sf.seek(sectorId*BLOCK_SIZE) # seek by byte offset, not sector index
				df.write(sf.read(BLOCK_SIZE))
			handle.execute("""update `hashes` set `dirty`=0 where `sector_id`=?""",(sectorId,))
			db.commit()

def getPath(index):
	nodeIds=[]
	k=1
	# tree depth = ceil(log_FILES_PER_DIR(FILE_COUNT))
	for i in range(math.ceil(math.log(FILE_COUNT)/math.log(FILES_PER_DIR))):
		nodeIds.append(str(index//k)) # pathlib.Path needs string components
		k*=FILES_PER_DIR
	nodeIds.reverse()
	return pathlib.Path(*nodeIds)
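
# worked example: with FILES_PER_DIR=100 and FILE_COUNT=2**21 the tree is
# ceil(log(2**21)/log(100)) = 4 levels deep, so
#   getPath(123456) == pathlib.Path("0/12/1234/123456")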

action=sys.argv[1]
fileName=sys.argv[2]
baseName=".".join(fileName.split(".")[:-1])
hashFile="{0}-hash.dat".format(baseName)
isoDate=datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
logFile="{0}-{1}.log".format(baseName,isoDate)

if action=="init": initHashes(fileName, hashFile)
elif action=="update": compare(fileName, hashFile)
else: print("bad action")
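
# example invocations (hypothetical file names):
#   python work.py init backup.img     # writes sector hashes to backup-hash.dat
#   python work.py update backup.img   # logs changed sectors to backup-<timestamp>.log
#                                      # and flags them dirty in morevna.db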