updatehash.py (6786B)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Maintain an SQLite database of MD5 / SHA-1 checksums for a directory tree.

Usage: updatehash.py database-file directory

The directory is walked; every regular file whose (mtime, size) differs from
the cached row is re-hashed and stored in table ``files``.  Rows below the
directory that no longer correspond to a file on disk are moved to table
``removedfiles`` (unless more than half of the files vanished, which is
treated as "the disk is probably not mounted").
"""
import hashlib
import math
import os
import sqlite3
import stat
import sys
import threading
import time

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3

# Common functions


def removePrefix(fileName):
    """Return fileName with every leading ".%" marker stripped."""
    while fileName[0:2] == ".%":
        fileName = fileName[2:]
    return fileName


def removePrefixPath(path):
    """Apply removePrefix() to each '/'-separated component of path."""
    return '/'.join([removePrefix(component) for component in path.split('/')])


# Code for this utility

# Bounded queues: the reader blocks once 4 chunks are waiting for a worker.
md5Jobs = Queue.Queue(4)
sha1Jobs = Queue.Queue(4)
# Worker threads; created and started by main() when multithread is set.
md5Thread = None
sha1Thread = None
processedFilesCount = 0
updatedFilesCount = 0
skippedFilesCount = 0
processedFoldersCount = 0


class checksumThread(threading.Thread):
    """Worker thread feeding chunks taken from a queue into a hash object.

    hashlibObjectBuilder is a zero-argument callable returning a fresh hash
    object (e.g. ``hashlib.md5`` itself, NOT ``hashlib.md5()``); jobsQueue is
    the queue the chunks are read from.
    """

    def __init__(self, hashlibObjectBuilder, jobsQueue):
        threading.Thread.__init__(self)
        self.hashlibObjectBuilder = hashlibObjectBuilder
        self.hashlibObject = hashlibObjectBuilder()
        self.jobsQueue = jobsQueue
        # Not named "isAlive": that would shadow threading.Thread.isAlive().
        self.keepRunning = True

    def run(self):
        """Consume chunks until stop() is called; None is a wake-up marker."""
        while self.keepRunning:
            chunk = self.jobsQueue.get(block=True)
            if chunk is not None:
                self.hashlibObject.update(chunk)
            self.jobsQueue.task_done()

    def stop(self):
        """Ask the thread to exit and wake it up if it is blocked on get()."""
        self.keepRunning = False
        # Note: Injecting a string in the queue is a bad idea since it would
        # change the checksum, hence the None marker.
        self.jobsQueue.put(None)

    def getSum(self):
        """Return the hex digest of all chunks queued so far, then reset."""
        # Wait until all chunks sent until this point are processed.
        self.jobsQueue.join()
        digest = self.hashlibObject.hexdigest()
        self.hashlibObject = self.hashlibObjectBuilder()
        return digest


# When True (and the worker threads are running) MD5 and SHA-1 are computed
# in parallel by md5Thread / sha1Thread; otherwise they are computed inline.
multithread = True


def _checksumFileInline(path):
    """Hash the file at path with MD5 and SHA-1 in the calling thread."""
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    chunkSize = 2 * md5.block_size * sha1.block_size
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(chunkSize)
            if not chunk:
                return {'md5': md5.hexdigest(), 'sha1': sha1.hexdigest()}
            md5.update(chunk)
            sha1.update(chunk)


def _checksumFileThreaded(path):
    """Hash the file at path by feeding its chunks to both worker threads."""
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(1048576)  # 1 Megabyte
            if not chunk:
                return {'md5': md5Thread.getSum(), 'sha1': sha1Thread.getSum()}
            md5Jobs.put(chunk)
            sha1Jobs.put(chunk)


def checksumFile(path):
    """Return {'md5': hexdigest, 'sha1': hexdigest} for the file at path.

    Uses the worker threads when multithread is set and they have been
    created; falls back to hashing in the calling thread otherwise, so the
    function is usable even when the workers were never started.
    """
    if multithread and md5Thread is not None and sha1Thread is not None:
        return _checksumFileThreaded(path)
    return _checksumFileInline(path)


def fileInfo(path):
    """Return {'mtime', 'size'} for a regular file, or None otherwise.

    lstat() is used, so a symbolic link is never reported as a regular file.
    """
    st = os.lstat(path)
    if not stat.S_ISREG(st.st_mode):
        return None
    return {'mtime': st.st_mtime, 'size': st.st_size}


def initdb(cursor):
    """Create the tables and index used by this utility if they are missing."""
    cursor.execute("create table if not exists files(timestamp,path primary key,md5,sha1,mtime,size)")
    cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)")
    cursor.execute("create table if not exists removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)")


def cacheFileInfo(cursor, path):
    """Return the cached {'mtime', 'size'} for path, or None if not cached."""
    cursor.execute('select mtime,size from files where path = ?', (path,))
    data = cursor.fetchone()
    if not data:
        return data
    return {'mtime': data[0], 'size': data[1]}


def update(connection, cursor, path):
    """Refresh the checksums of every file below path, then prune stale rows.

    connection / cursor: an open sqlite3 connection and one of its cursors,
    already prepared by initdb().  path: the directory to walk.
    Updates the module-level statistics counters as a side effect.
    """
    global processedFilesCount
    global processedFoldersCount
    global updatedFilesCount
    global skippedFilesCount

    cursor.execute("create temp table newfiles(path)")
    cursor.execute("create index i_newfiles_path on newfiles(path)")
    timestamp = time.time()
    # Wall-clock time: time.clock() measured CPU time on Unix and no longer
    # exists in Python 3.8+.
    lastTime = time.time()
    for dirpath, _dirnames, filenames in os.walk(path):
        processedFoldersCount += 1
        for f in filenames:
            prefixPath = os.path.join(dirpath, f)
            if not os.path.isfile(prefixPath):
                continue
            processedFilesCount += 1
            fi = fileInfo(prefixPath)
            if fi is None:
                skippedFilesCount += 1
                print("!skipping: no fileinfo:  %s" % (prefixPath,))
                continue
            fpath = removePrefixPath(prefixPath)
            if fpath != prefixPath and os.path.exists(fpath):
                skippedFilesCount += 1
                print("!skipping: collision between '%s' and '%s'" % (prefixPath, fpath,))
                continue
            cfi = cacheFileInfo(cursor, fpath)
            cursor.execute("insert into newfiles(path) values(?)", (fpath,))
            if fi != cfi:
                updatedFilesCount += 1
                if fpath != prefixPath:
                    print(" updating %s (%s)" % (prefixPath, fpath,))
                else:
                    print(" updating %s" % (fpath,))
                sums = checksumFile(prefixPath)
                values = (timestamp, fpath, sums['md5'], sums['sha1'], fi['mtime'], fi['size'])
                cursor.execute("insert or replace into files(timestamp,path,md5,sha1,mtime,size) values(?,?,?,?,?,?)", values)

            # Commit at most every 10 seconds so an interrupted run loses
            # little work without paying for a commit per file.
            currentTime = time.time()
            if abs(lastTime - currentTime) >= 10:
                lastTime = currentTime
                connection.commit()
                print("commit!")
    connection.commit()
    print("commit!")

    print("cleaning up...")
    # Stored paths went through removePrefixPath(), so the prefix must too.
    prefix = removePrefixPath(path if path[-1:] == '/' else path + '/')
    cursor.execute("create temp table deletedfiles(path)")
    cursor.execute("create index i_deletedfiles_path on deletedfiles(path)")
    # Exact, case-sensitive prefix comparison.  LIKE would treat '%' and '_'
    # inside the directory name as wildcards and ignore ASCII case, which
    # could select (and later remove) rows belonging to other directories.
    cursor.execute(
        "insert into deletedfiles(path) select path from files"
        " where substr(path, 1, length(?)) = ?",
        (prefix, prefix))

    nbFilesBefore = cursor.execute("select count(*) from deletedfiles").fetchone()[0]
    nbFilesAfter = cursor.execute("select count(*) from newfiles").fetchone()[0]
    print('number of files before:  %s' % (nbFilesBefore,))
    print('number of files after:  %s' % (nbFilesAfter,))

    cursor.execute("delete from deletedfiles where path in newfiles")
    nbFilesDelete = cursor.execute("select count(*) from deletedfiles").fetchone()[0]
    print('number of files to remove from database (moved in table removedfiles):  %s' % (nbFilesDelete,))

    if nbFilesAfter < math.ceil(nbFilesBefore * 0.5):
        print("!!! Not deleting hashes from database: there are less than 50% files after. Did you forget to mount your harddisk?")
    else:
        cursor.execute("insert into removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)"
                       + " select ?,timestamp,path,md5,sha1,mtime,size from files where path in deletedfiles", (timestamp,))
        cursor.execute("delete from files where path in deletedfiles")

    connection.commit()


def walk(db, path):
    """Open (or create) the database file db and update it for path."""
    connection = sqlite3.connect(db)
    connection.text_factory = str  # For utf-8 file names...
    try:
        cursor = connection.cursor()
        try:
            initdb(cursor)
            update(connection, cursor, path)
        finally:
            cursor.close()
    finally:
        connection.close()


def help():
    """Print the usage line and exit with status 1."""
    print('Usage : %s database-file directory' % sys.argv[0])
    sys.exit(1)


def main():
    """Command-line entry point: parse argv, run the update, print statistics."""
    global md5Thread
    global sha1Thread

    if len(sys.argv) != 3:
        help()
    for arg in sys.argv[1:]:
        if arg == '-h' or arg == '--help':
            help()

    # Start threads and walk the filesystem
    currentTime = time.time()
    workers = []
    if multithread:
        # Pass the constructors, not instances: the thread builds a fresh
        # hash object for every file.
        md5Thread = checksumThread(hashlib.md5, md5Jobs)
        sha1Thread = checksumThread(hashlib.sha1, sha1Jobs)
        workers = [md5Thread, sha1Thread]
        for worker in workers:
            worker.start()
    try:
        walk(sys.argv[1], sys.argv[2])
    finally:
        # Always release the workers, otherwise a failure in walk() would
        # leave the process hanging on the non-daemon threads.
        for worker in workers:
            worker.stop()
        for worker in workers:
            worker.join()
    elapsedTime = time.time() - currentTime
    elapsedTime = round(elapsedTime, 3)

    # Statistics
    print('\n== Result ================================')
    if elapsedTime > 1:
        print(' Total elapsed time:  %s  seconds' % format(elapsedTime))
    else:
        print(' Total elapsed time:  %s  second' % format(elapsedTime))
    print(' Processed files: %s' % format(processedFilesCount))
    print(' Processed folders: %s' % format(processedFoldersCount))
    print(' Updated files: %s' % format(updatedFilesCount))
    print(' Skipped files: %s' % format(skippedFilesCount))


if __name__ == '__main__':
    main()