diff --git a/work.py b/work.py
--- a/work.py
+++ b/work.py
@@ -4,6 +4,7 @@
 import datetime
 import sqlite3
 import math
 import pathlib
+import itertools
 BLOCK_SIZE=4096
 HASH_LEN=8
@@ -24,10 +25,16 @@ def initDB():
     c.execute("""create index if not exists `dirty_index` on `hashes` (`dirty`)""")
     conn.commit()
 
-def initHashes(fileName, hashFile):
-    with open(hashFile, mode="bw") as f:
-        for chunkHash in chunks(fileName):
-            f.write(chunkHash)
+def initHashes(fileName):
+    initDB()
+
+    with sqlite3.connect(DB) as db:
+        handle=db.cursor()
+        handle.executemany(
+            """insert into `hashes` (`sector_id`,`sector_hash`,`dirty`) values (:id,:hash,1)""",
+            ({"id":i,"hash":dataHash} for (i,dataHash) in enumerate(chunks(fileName)))
+        )
+        db.commit()
 
 def chunks(fileName):
     with open(fileName, mode="br") as f:
@@ -36,12 +43,7 @@
         yield hashlib.sha256(data).digest()[:HASH_LEN]
         data=f.read(BLOCK_SIZE)
 
-def hashes(filename):
-#    with open(filename, mode="br") as f:
-#        hashBytes=f.read(HASH_LEN)
-#        while hashBytes:
-#            yield hashBytes
-#            hashBytes=f.read(HASH_LEN)
+def hashes():
     with sqlite3.connect(DB) as db:
         handle=db.cursor()
         handle.execute("""select `sector_hash` from `hashes` order by `sector_id` asc""")
         h=handle.fetchone()
         while h:
-            yield h
+            yield h[0]  # review fix: fetchone() returns a 1-tuple row; yield the raw hash bytes so the comparison in findChanges can ever match
             h=handle.fetchone()
@@ -50,15 +52,16 @@
-def compare(fileName, hashFile):
+def findChanges(fileName):
     changelist=[]
 
     # build changelist
     # can use up to (HASH_LEN+size(int))*(filesize/BLOCK_SIZE) bytes of memory plus overhead
     # that's (8+4)*(8*2**30 / 4096) = 24MB for defaults
-    for (i,(dataHash,savedHash)) in enumerate(zip(chunks(fileName),hashes(hashFile))):
+    for (i,(dataHash,savedHash)) in enumerate(itertools.zip_longest(chunks(fileName),hashes())):
         if dataHash!=savedHash:
             changelist.append((i,dataHash))
+            if dataHash is None: break # shouldn't happen
 
     # write log
     with open(logFile, mode="w") as f:
@@ -69,41 +72,38 @@
     # update DB
     with sqlite3.connect(DB) as db:
         handle=db.cursor()
-        handle.executemany("""update `hashes` set `sector_hash`=':hash', `dirty`=1 where `sector_id`=:id""",
-            {"sector_id":i,"sector_hash":dataHash} for (i,dataHash) in changelist)
+        handle.executemany(  # NOTE(review): UPDATE silently skips sector ids not yet in the table (file grown since init); needs an upsert if `sector_id` is unique -- confirm schema
+            """update `hashes` set `sector_hash`=:hash, `dirty`=1 where `sector_id`=:id""",
+            ({"id":i,"hash":dataHash} for (i,dataHash) in changelist)
+        )
         db.commit()
-
-    # update hashFile
-#    with open(hashFile, mode="r+b") as f:
-#        for (i,dataHash) in changelist:
-#            f.seek(i*HASH_LEN)
-#            f.write(dataHash)
 
-def transferChanges():
+def transferChanges(targetPath):
     # read changes
     with sqlite3.connect(DB) as db, open(fileName,mode="rb") as sf:
         handle=db.cursor()
-        handle.execute("""select `hash_id` from `hashes` where `dirty`=1""")
+        handle.execute("""select `sector_id` from `hashes` where `dirty`=1""")
 
         # transfer modified sectors and mark them as clean
         sectorIds=handle.fetchall()
-        for sectorId in sectorIds:
-            path=getPath(sectorId)
-            path.parent.mkdir(parents=True,exist_ok=True)
+        for (sectorId,) in sectorIds:
+            path=targetPath / getPath(sectorId)
+            try: path.parent.mkdir(parents=True)
+            except FileExistsError: pass
-            df=path.open(mode="wb")
-            sf.seek(sectorId)
-            df.write(sf.read(BLOCK_SIZE))
+            with path.open(mode="wb") as df:  # review fix: close the destination file deterministically
+                sf.seek(sectorId*BLOCK_SIZE)  # review fix: seek by byte offset, not sector index
+                df.write(sf.read(BLOCK_SIZE))
-            handle.execute("""update `hashes` set `dirty`=0 where `hash_id`=?""",sectorId)
+            handle.execute("""update `hashes` set `dirty`=0 where `sector_id`=?""",(sectorId,))
         db.commit()
 
 def getPath(index):
     nodeIds=[]
     k=1
-    for i in range(math.ceil(math.log(FILE_COUNT)/math.log(k))):
+    while k<=FILE_COUNT:
         nodeIds.append(index//k)
         k*=FILES_PER_DIR
     nodeIds.reverse()
-    return pathlib.Path(*nodeIds)
+    return pathlib.Path(*[str(id) for id in nodeIds])
 
 action=sys.argv[1]
 fileName=sys.argv[2]
@@ -112,7 +112,8 @@ hashFile="{0}-hash.dat".format(baseName)
 isoDate=datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
 logFile="{0}-{1}.log".format(baseName,isoDate)
 
-if action=="init": initHashes(fileName, hashFile)
-elif action=="update": compare(fileName, hashFile)
+if action=="init": initHashes(fileName)
+elif action=="update": findChanges(fileName)
+elif action=="transfer": transferChanges(sys.argv[3])
 else: print("bad action")