@@ -4,6 +4,7 @@ import datetime
import sqlite3
import math
import pathlib
import itertools
# Size in bytes of each file sector that gets hashed (matches f.read(BLOCK_SIZE) in chunks()).
BLOCK_SIZE=4096
# Number of bytes kept from each SHA-256 digest (chunks() truncates digest()[:HASH_LEN]).
HASH_LEN=8
@@ -24,10 +25,16 @@ def initDB():
# NOTE(review): interior of initDB(); its def line and the table-creation
# statements are outside this hunk's view. The index on `dirty` speeds the
# `where dirty=1` scans used when transferring changed sectors.
    c.execute("""create index if not exists `dirty_index` on `hashes` (`dirty`)""")
    conn.commit()
def initHashes(fileName, hashFile):
    """Write the truncated per-sector hashes of fileName to hashFile as raw bytes.

    (Pre-image of this diff: the flat-file hash store, replaced below by the
    DB-backed initHashes(fileName).)
    """
    with open(hashFile, mode="bw") as out:
        # writelines() on a binary file performs the same sequence of write()
        # calls as the original explicit loop.
        out.writelines(chunks(fileName))
def initHashes(fileName):
    """Create the hash DB (if needed) and insert one dirty row per sector of fileName."""
    initDB()
    with sqlite3.connect(DB) as db:
        # One parameter dict per sector, streamed lazily into executemany.
        rows=({"id":idx,"hash":h} for (idx,h) in enumerate(chunks(fileName)))
        cur=db.cursor()
        cur.executemany(
            """insert into `hashes` (`sector_id`,`sector_hash`,`dirty`) values (:id,:hash,1)""",
            rows
        )
        db.commit()
def chunks(fileName):
    # Generator: yields the first HASH_LEN bytes of the SHA-256 digest of each
    # consecutive BLOCK_SIZE-sized sector of fileName, in file order.
    with open(fileName, mode="br") as f:
# NOTE(review): this file is a unified diff with the +/- prefixes stripped; the
# hunk header below elides the loop head (presumably the initial read and a
# `while data:` test — confirm against the full file). `hashlib` is used here
# but not among the visible imports; presumably imported above this view.
@@ -36,12 +43,7 @@ def chunks(fileName):
        yield hashlib.sha256(data).digest()[:HASH_LEN]
        data=f.read(BLOCK_SIZE)
# NOTE(review): stripped diff — the def with the commented-out body is the
# removed file-based generator; the zero-arg def after it is its DB-backed
# replacement.
def hashes(filename):
    # with open(filename, mode="br") as f:
    #     hashBytes=f.read(HASH_LEN)
    #     while hashBytes:
    #         yield hashBytes
def hashes():
    # Generator over the saved sector hashes in ascending sector order.
    # NOTE(review): `handle` is not bound anywhere visible in this hunk —
    # confirm a module-level cursor exists before hashes() is iterated.
    handle.execute("""select `sector_hash` from `hashes` order by `sector_id` asc""")
@@ -50,15 +52,16 @@ def hashes(filename):
# NOTE(review): lines elided by the hunk header above (likely the initial
# fetchone and the loop test). As shown, `h` is a fetchone() row TUPLE, so
# `yield h` yields ("…",)-style tuples while chunks() yields bytes — a direct
# comparison in findChanges would then flag every sector as changed. Confirm
# the elided lines unpack the row (e.g. yield h[0]).
        yield h
        h=handle.fetchone()
# NOTE(review): stripped diff — compare(fileName, hashFile) is the removed
# pre-image signature; findChanges(fileName) is its replacement.
def compare(fileName, hashFile):
def findChanges(fileName):
    changelist=[]
    # build changelist
    # can use up to (HASH_LEN+size(int))*(filesize/BLOCK_SIZE) bytes of memory plus overhead
    # that's (8+4)*(8*2**30 / 4096) = 24MB for defaults
# NOTE(review): pre-image used zip (a length change between file and saved
# hashes went unnoticed); post-image uses zip_longest, which pads the shorter
# side with None so growth/shrink registers as a difference.
    for (i,(dataHash,savedHash)) in enumerate(zip(chunks(fileName),hashes(hashFile))):
    for (i,(dataHash,savedHash)) in enumerate(itertools.zip_longest(chunks(fileName),hashes())):
        if dataHash!=savedHash:
            changelist.append((i,dataHash))
            # NOTE(review): when the source file shrank, dataHash is None and
            # (i, None) has already been appended before this break — the later
            # update would write a NULL sector_hash. The "shouldn't happen"
            # comment suggests this case was not expected; verify intent.
            if dataHash is None: break # shouldn't happen
    # write log
    # `logFile` is a module-level global built from baseName + timestamp.
    with open(logFile, mode="w") as f:
@@ -69,41 +72,38 @@ def compare(fileName, hashFile):
    # update DB
    # NOTE(review): `handle` is not bound anywhere visible in this hunk.
    # Pre-image SQL below quoted ':hash' (named parameters are not substituted
    # inside string literals, so the literal text ":hash" would be stored) and
    # its dicts used keys sector_id/sector_hash that don't match :id/:hash;
    # the two post-image lines fix both defects.
    handle.executemany("""update `hashes` set `sector_hash`=':hash', `dirty`=1 where `sector_id`=:id""",
        {"sector_id":i,"sector_hash":dataHash} for (i,dataHash) in changelist)
    """update `hashes` set `sector_hash`=:hash, `dirty`=1 where `sector_id`=:id""",
    ({"id":i,"hash":dataHash} for (i,dataHash) in changelist)
    # update hashFile
    # with open(hashFile, mode="r+b") as f:
    #     for (i,dataHash) in changelist:
    #         f.seek(i*HASH_LEN)
    #         f.write(dataHash)
def transferChanges(targetPath):
    """Copy every dirty sector of the (module-global) fileName into one file per
    sector under targetPath, laid out by getPath(), then mark those sectors
    clean in the DB.

    targetPath: destination directory; may be a str (it arrives straight from
    sys.argv[3]) or a pathlib.Path.
    """
    # The original did `targetPath / getPath(...)`, which raises TypeError when
    # targetPath is the str from sys.argv; normalise once up front.
    target=pathlib.Path(targetPath)
    # read changes
    with sqlite3.connect(DB) as db, open(fileName,mode="rb") as sf:
        handle=db.cursor()  # was never bound in the original body
        handle.execute("""select `sector_id` from `hashes` where `dirty`=1""")
        # transfer modified sectors and mark them as clean
        sectorIds=handle.fetchall()
        for (sectorId,) in sectorIds:
            path=target / getPath(sectorId)
            path.parent.mkdir(parents=True,exist_ok=True)
            # Sector i starts at byte i*BLOCK_SIZE (chunks() hashed consecutive
            # BLOCK_SIZE reads); the original seeked to byte offset sectorId.
            sf.seek(sectorId*BLOCK_SIZE)
            # The original opened df and never closed it; use the context manager.
            with path.open(mode="wb") as df:
                df.write(sf.read(BLOCK_SIZE))
            handle.execute("""update `hashes` set `dirty`=0 where `sector_id`=?""",(sectorId,))
        # The connect() context manager commits the transaction on clean exit.
def getPath(index):
    """Map a sector index to a relative pathlib.Path of nested numeric components.

    One component per tree level (most significant first), with FILES_PER_DIR
    as the branching factor; levels are added while the divisor k stays within
    FILE_COUNT (both module-level constants).
    """
    nodeIds=[]
    k=1
    # The removed pre-image loop computed math.log(FILE_COUNT)/math.log(k) with
    # k==1, i.e. division by log(1)==0 — the while form walks the levels directly.
    while k<=FILE_COUNT:
        nodeIds.append(index//k)
        k*=FILES_PER_DIR
    nodeIds.reverse()
    # Path components must be str, not int (pre-image passed raw ints); also
    # avoid shadowing the builtin `id` in the comprehension.
    return pathlib.Path(*[str(n) for n in nodeIds])
# --- CLI entry: action keyword and source file taken from argv ---
# NOTE(review): `sys` is used here but not among the visible imports;
# presumably imported above this view.
action=sys.argv[1]
fileName=sys.argv[2]
# NOTE(review): the hunk header below elides the baseName/hashFile definitions
# (its context line shows hashFile="{0}-hash.dat".format(baseName)).
@@ -112,7 +112,8 @@ hashFile="{0}-hash.dat".format(baseName)
isoDate=datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
logFile="{0}-{1}.log".format(baseName,isoDate)
# Pre-image dispatch (removed by this diff): flat-file hash store.
if action=="init": initHashes(fileName, hashFile)
elif action=="update": compare(fileName, hashFile)
# Post-image dispatch: DB-backed, plus a new `transfer` action whose target
# directory comes from sys.argv[3].
if action=="init": initHashes(fileName)
elif action=="update": findChanges(fileName)
elif action=="transfer": transferChanges(sys.argv[3])
else: print("bad action")
Status change: