Files
@ 72963904714e
Branch filter:
Location: Morevna/work.py - annotation
72963904714e
3.2 KiB
text/x-python
rewritten for db storage
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e 72963904714e 72963904714e 72963904714e a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 72963904714e 72963904714e 72963904714e 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 72963904714e a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 a52fefe61468 72963904714e 72963904714e 72963904714e a52fefe61468 a52fefe61468 | import hashlib
import sys
import datetime
import sqlite3
import math
import pathlib
import itertools
# Size in bytes of one sector: the unit that is read, hashed and transferred at a time.
BLOCK_SIZE=4096
# Number of leading bytes of each SHA-256 digest kept as the stored sector hash.
HASH_LEN=8
# Path of the SQLite database holding the per-sector hashes and dirty flags.
DB="morevna.db"
# Fan-out used by getPath(): every directory level divides the sector index by this factor.
FILES_PER_DIR=100
# Upper bound used by getPath() to decide how many path levels to generate.
# With BLOCK_SIZE=4096, 2**21 sectors correspond to an 8 GiB file.
FILE_COUNT=2**21
def initDB():
    """Create the `hashes` table and its `dirty_index` index if they are missing.

    The table holds one row per sector: `sector_id` (integer primary key),
    `sector_hash` (non-null blob) and `dirty` (0 or 1). Both statements use
    ``if not exists``, so calling this function repeatedly is harmless.
    """
    # Open the same database file every other function uses (the DB constant)
    # instead of a separately hard-coded path.
    conn = sqlite3.connect(DB)
    try:
        c = conn.cursor()
        c.execute("""create table if not exists `hashes` (
            `sector_id` integer primary key,
            `sector_hash` blob not null,
            `dirty` integer check(`dirty` in (0,1))
        )""")
        c.execute("""create index if not exists `dirty_index` on `hashes` (`dirty`)""")
        conn.commit()
    finally:
        # Release the file handle even when one of the statements fails.
        conn.close()
def initHashes(fileName):
    """Hash every sector of `fileName` and store the hashes as dirty rows.

    Calls initDB() first so the schema exists, then inserts one row per chunk
    produced by chunks(fileName), numbered from 0 and flagged `dirty`=1.

    A plain INSERT is used, so running this against a database that already
    contains the same sector ids fails on the primary key.
    """
    initDB()
    db = sqlite3.connect(DB)
    try:
        # `with db` commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with db:
            handle = db.cursor()
            handle.executemany(
                """insert into `hashes` (`sector_id`,`sector_hash`,`dirty`) values (:id,:hash,1)""",
                ({"id": i, "hash": dataHash} for (i, dataHash) in enumerate(chunks(fileName)))
            )
    finally:
        db.close()
def chunks(fileName):
    """Yield a truncated SHA-256 hash for each consecutive chunk of a file.

    The file is read in binary mode, BLOCK_SIZE bytes at a time, until a read
    returns no data. Each yielded value is the first HASH_LEN bytes of the
    chunk's SHA-256 digest.
    """
    with open(fileName, mode="br") as source:
        # iter(callable, sentinel) keeps reading until read() returns b"".
        for block in iter(lambda: source.read(BLOCK_SIZE), b""):
            digest = hashlib.sha256(block).digest()
            yield digest[:HASH_LEN]
def hashes():
    """Yield the stored sector hashes in ascending `sector_id` order.

    Each yielded item is the `sector_hash` value itself (the blob stored in
    the row), not the one-element row tuple returned by the cursor, so it can
    be compared directly with the values produced by chunks().
    """
    db = sqlite3.connect(DB)
    try:
        cursor = db.execute("""select `sector_hash` from `hashes` order by `sector_id` asc""")
        # Unpack the single-column row so callers receive the bare hash.
        for (sectorHash,) in cursor:
            yield sectorHash
    finally:
        # Runs when the generator is exhausted or closed early.
        db.close()
def findChanges(fileName):
    """Detect sectors of `fileName` whose hash differs from the stored one.

    Three steps:
      1. Walk chunks(fileName) and hashes() in lockstep and collect
         (sector index, new hash) for every mismatch.
      2. Write the collected changes to the module-level `logFile`.
      3. Store the new hashes in the database and flag those sectors dirty.

    Sectors beyond the last stored row (the file grew) are inserted as new
    dirty rows. If the file is shorter than the stored hash list, the scan
    stops at the end of the file and the extra rows are left untouched.
    """
    changelist = []
    # build changelist
    # can use up to (HASH_LEN+size(int))*(filesize/BLOCK_SIZE) bytes of memory plus overhead
    # that's (8+4)*(8*2**30 / 4096) = 24MB for defaults
    for (i, (dataHash, savedHash)) in enumerate(itertools.zip_longest(chunks(fileName), hashes())):
        # The file ended before the stored hashes did (shouldn't happen).
        # Stop before recording anything: a None hash must reach neither the
        # log nor the NOT NULL `sector_hash` column.
        if dataHash is None:
            break
        if dataHash != savedHash:
            changelist.append((i, dataHash))

    # write log
    with open(logFile, mode="w") as f:
        f.write("sector hash\n")
        for (i, dataHash) in changelist:
            f.write("{0}\t{1}\n".format(i, dataHash))

    # update DB
    db = sqlite3.connect(DB)
    try:
        # `with db` commits on success and rolls back on error; the
        # connection itself is closed in the finally clause.
        with db:
            handle = db.cursor()
            # "insert or replace" rewrites existing rows and also creates
            # rows for sectors that have no stored hash yet; a bare UPDATE
            # would silently skip the latter.
            handle.executemany(
                """insert or replace into `hashes` (`sector_id`,`sector_hash`,`dirty`) values (:id,:hash,1)""",
                ({"id": i, "hash": dataHash} for (i, dataHash) in changelist)
            )
    finally:
        db.close()
def transferChanges(targetPath):
    """Copy every dirty sector of the source file into a tree under `targetPath`.

    For each row of `hashes` with `dirty`=1, the corresponding BLOCK_SIZE-byte
    sector is read from the module-level `fileName` and written to
    ``targetPath / getPath(sectorId)``; the row is then marked clean. All
    flag updates are committed together at the end, and rolled back if an
    exception interrupts the transfer.

    `targetPath` may be a string or a path object.
    """
    targetRoot = pathlib.Path(targetPath)
    db = sqlite3.connect(DB)
    try:
        # `with db` commits on success / rolls back on error; closing the
        # connection is handled by the finally clause.
        with db, open(fileName, mode="rb") as sf:
            handle = db.cursor()
            # read changes
            handle.execute("""select `sector_id` from `hashes` where `dirty`=1""")
            sectorIds = handle.fetchall()
            # transfer modified sectors and mark them as clean
            for (sectorId,) in sectorIds:
                path = targetRoot / getPath(sectorId)
                try:
                    path.parent.mkdir(parents=True)
                except FileExistsError:
                    pass
                # Sector ids count BLOCK_SIZE-byte chunks, so the byte offset
                # is the id multiplied by the sector size.
                sf.seek(sectorId * BLOCK_SIZE)
                with path.open(mode="wb") as df:
                    df.write(sf.read(BLOCK_SIZE))
                handle.execute("""update `hashes` set `dirty`=0 where `sector_id`=?""", (sectorId,))
    finally:
        db.close()
def getPath(index):
    """Return the relative path under which sector `index` is stored.

    The path has one component per power of FILES_PER_DIR that does not
    exceed FILE_COUNT. Each component is `index` integer-divided by that
    power, ordered from the largest divisor to 1, so the final component is
    `index` itself.
    """
    divisors = []
    divisor = 1
    while divisor <= FILE_COUNT:
        divisors.append(divisor)
        divisor *= FILES_PER_DIR
    components = [str(index // d) for d in reversed(divisors)]
    return pathlib.Path(*components)
# Command-line entry point:
#   <script> init FILE              hash every sector of FILE into the database
#   <script> update FILE            record sectors whose hash changed, write a log
#   <script> transfer FILE TARGET   copy dirty sectors of FILE under TARGET
if len(sys.argv) < 3:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: {0} init|update|transfer FILE [TARGET_DIR]".format(sys.argv[0]))
action = sys.argv[1]
# `fileName` and `logFile` are read as globals by transferChanges() and findChanges().
fileName = sys.argv[2]
# Everything before the last "." of the given name.
# NOTE(review): this is empty when the name contains no "." — confirm that is intended.
baseName = ".".join(fileName.split(".")[:-1])
# NOTE(review): hashFile is not referenced by any function visible in this file.
hashFile = "{0}-hash.dat".format(baseName)
isoDate = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
logFile = "{0}-{1}.log".format(baseName, isoDate)

if action == "init":
    initHashes(fileName)
elif action == "update":
    findChanges(fileName)
elif action == "transfer":
    if len(sys.argv) < 4:
        sys.exit("transfer requires a target directory")
    transferChanges(sys.argv[3])
else:
    print("bad action")
|