www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

updatehash.py (6786B)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 import hashlib
      4 import os
      5 import sqlite3
      6 import time
      7 import sys
      8 import stat
      9 import math
     10 import threading, Queue
     11 
     12 # Common functions
     13 
     14 def removePrefix(fileName):
     15 	while fileName[0:2] == ".%":
     16 		fileName = fileName[2:]
     17 	return fileName
     18 
     19 def removePrefixPath(path):
     20 	return '/'.join([removePrefix(component) for component in path.split('/')])
     21 
     22 # Code for this utility
     23 
# Bounded job queues feeding the checksum worker threads; the small
# maxsize (4 chunks) keeps the file reader from running far ahead of
# the hashing threads.
md5Jobs = Queue.Queue(4)
sha1Jobs = Queue.Queue(4)
# md5Thread is defined below
# sha1Thread is defined below
# Global run statistics, printed as a summary when the script finishes.
processedFilesCount = 0
updatedFilesCount = 0
skippedFilesCount = 0
processedFoldersCount = 0
     32 
     33 class checksumThread(threading.Thread):
     34 	def __init__(self, hashlibObjectBuilder, jobsQueue):
     35 		threading.Thread.__init__(self)
     36 		self.hashlibObjectBuilder = hashlibObjectBuilder
     37 		self.hashlibObject = hashlibObjectBuilder()
     38 		self.jobsQueue = jobsQueue
     39 		self.isAlive = True
     40 	def run(self):
     41 		while self.isAlive:
     42 			chunk = self.jobsQueue.get(block = True)
     43 			if chunk is not None:
     44 				self.hashlibObject.update(chunk)
     45 			self.jobsQueue.task_done()
     46 	def stop(self):
     47 		self.isAlive = False
     48 		# Note: Injecting a string in the queue is a bad idea since it would change the checksum
     49 		self.jobsQueue.put(None)
     50 	def getSum(self):
     51 		self.jobsQueue.join() # Wait until all chunks sent until this point are processed.
     52 		sum = self.hashlibObject.hexdigest()
     53 		self.hashlibObject = self.hashlibObjectBuilder()
     54 		return sum
     55 
     56 multithread = True
     57 if multithread:
     58 	def checksumFile(path):
     59 		md5 = hashlib.md5()
     60 		sha1 = hashlib.sha1()
     61 		with open(path,'rb') as f: 
     62 			while True:
     63 				chunk = f.read(2*md5.block_size*sha1.block_size)
     64 				if not chunk:
     65 					return {'md5':md5.hexdigest(), 'sha1':sha1.hexdigest()}
     66 				md5.update(chunk)
     67 				sha1.update(chunk)
     68 else:
     69 	def checksumFile(path):
     70 		with open(path,'rb') as f: 
     71 			while True:
     72 				chunk = f.read(1048576) # 1 Megabyte
     73 				if not chunk:
     74 					return {'md5':md5Thread.getSum(), 'sha1':sha1Thread.getSum()}
     75 				md5Jobs.put(chunk)
     76 				sha1Jobs.put(chunk)
     77 
     78 def fileInfo(path):
     79 	st = os.lstat(path)
     80 	if not stat.S_ISREG(st.st_mode):
     81 		return None
     82 	return {'mtime':st.st_mtime, 'size':st.st_size}
     83 
     84 def initdb(cursor):
     85 	cursor.execute("create table if not exists files(timestamp,path primary key,md5,sha1,mtime,size)")
     86 	cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)")
     87 	cursor.execute("create table if not exists removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)")
     88 
     89 def cacheFileInfo(cursor, path):
     90 	cursor.execute('select mtime,size from files where path = ?', (path,))
     91 	data = cursor.fetchone()
     92 	return data and {'mtime':data[0], 'size':data[1]}
     93 
def update(connection,cursor,path):
	"""Walk *path*, refresh checksum rows in `files` for new/changed files,
	and move rows for files that disappeared into `removedfiles`.

	Commits periodically while hashing and once more at the end.
	"""
	global processedFilesCount
	global processedFoldersCount
	global updatedFilesCount
	global skippedFilesCount
	
	# Every path seen during this run is recorded here so that, afterwards,
	# rows in `files` that were NOT seen can be detected as deleted.
	cursor.execute("create temp table newfiles(path)")
	cursor.execute("create index i_newfiles_path on newfiles(path)")
	timestamp = time.time()
	# NOTE(review): time.clock() was removed in Python 3.8 and, on Unix,
	# measures CPU time rather than wall time — so the "10 second" commit
	# interval below is CPU seconds. Presumably acceptable here; confirm.
	currentTime = time.clock()
	lastTime = currentTime
	for d in os.walk(path):
		dirpath=d[0]
		processedFoldersCount += 1
		for f in d[2]:
			prefixPath = os.path.join(dirpath, f)
			if os.path.isfile(prefixPath):
				processedFilesCount += 1
				fi = fileInfo(prefixPath)
				if fi is None:
					# Not a regular file according to lstat (e.g. symlink).
					skippedFilesCount +=1
					print "!skipping: no fileinfo: ", prefixPath
					continue
				# Database key is the path with ".%" markers stripped; skip
				# the file if that normalized path collides with a real one.
				fpath = removePrefixPath(prefixPath)
				if fpath != prefixPath and os.path.exists(fpath):
					skippedFilesCount +=1
					print "!skipping: collision between '%s' and '%s'" % (prefixPath, fpath,)
					continue
				cfi = cacheFileInfo(cursor,fpath)
				cursor.execute("insert into newfiles(path) values(?)", (fpath,))
				# Re-hash only when (mtime, size) differ from the cached row
				# (or the row is missing entirely).
				if fi != cfi:
					updatedFilesCount += 1
					if fpath != prefixPath:
						print " updating %s (%s)" % (prefixPath, fpath,)
					else:
						print " updating %s" % (fpath,)
					sums = checksumFile(prefixPath)
					values = (timestamp,fpath,sums['md5'],sums['sha1'],fi['mtime'],fi['size'])
					cursor.execute("insert or replace into files(timestamp,path,md5,sha1,mtime,size) values(?,?,?,?,?,?)", values)
					
					# Commit at most every ~10 (CPU) seconds so a long run
					# does not lose all progress on interruption.
					currentTime = time.clock()
					if abs(lastTime-currentTime) >= 10:
						lastTime = currentTime
						connection.commit()
						print "commit!"
	connection.commit()
	print "commit!"

	print "cleaning up..."
	# LIKE pattern matching everything under `path` (with a trailing '/').
	# NOTE(review): '%' is doubled here as an escape attempt, but SQLite's
	# LIKE has no default escape character — without an "... ESCAPE '\'"
	# clause, '%%' still behaves as two wildcards, so paths containing '%'
	# may over-match. TODO confirm and fix with an ESCAPE clause.
	likepath=((path + '') if (path[-1:] == '/') else (path + '/')).replace('%', '%%') + '%';
	cursor.execute("create temp table deletedfiles(path)")
	cursor.execute("create index i_deletedfiles_path on deletedfiles(path)")
	cursor.execute("insert into deletedfiles(path) select path from files where path like ?", (likepath,));

	nbFilesBefore = cursor.execute("select count(*) from deletedfiles").fetchone()[0];
	nbFilesAfter = cursor.execute("select count(*) from newfiles").fetchone()[0];
	print 'number of files before: ', nbFilesBefore
	print 'number of files after: ', nbFilesAfter

	# Anything previously known under `path` but not seen this run is deleted.
	cursor.execute("delete from deletedfiles where path in newfiles");
	nbFilesDelete = cursor.execute("select count(*) from deletedfiles").fetchone()[0];
	print 'number of files to remove from database (moved in table removedfiles): ', nbFilesDelete
	
	# Safety net: if more than half the files vanished, assume the source
	# tree is simply not mounted and keep the existing hashes.
	if (nbFilesAfter < math.ceil(nbFilesBefore * 0.5)):
		print "!!! Not deleting hashes from database: there are less than 50% files after. Did you forget to mount your harddisk?"
	else:
		cursor.execute("insert into removedfiles(rmtimestamp,timestamp,path,md5,sha1,mtime,size)"
					   + " select ?,timestamp,path,md5,sha1,mtime,size from files where path in deletedfiles", (timestamp,))
		cursor.execute("delete from files where path in deletedfiles")
	
	connection.commit()
    165 
    166 def walk(db,path):
    167 	connection = sqlite3.connect(db)
    168 	connection.text_factory = str # For utf-8 file names…
    169 	cursor = connection.cursor()
    170 	initdb(cursor)
    171 	update(connection, cursor, path)
    172 	cursor.close()
    173 
    174 def help():
    175 	print 'Usage : %s database-file directory' % sys.argv[0]
    176 	sys.exit(1)
    177 
    178 if len(sys.argv) != 3:
    179 	help()
    180 for arg in sys.argv[1:]:
    181 	if arg == '-h' or arg == '--help':
    182 		help()
    183 
    184 # Start threads and walk the filesystem
    185 currentTime = time.time()
    186 md5Thread = checksumThread(hashlib.md5(), md5Jobs);
    187 md5Thread.start()
    188 sha1Thread = checksumThread(hashlib.sha1(), sha1Jobs);
    189 sha1Thread.start()
    190 walk(sys.argv[1], sys.argv[2])
    191 md5Thread.stop()
    192 sha1Thread.stop()
    193 elapsedTime = time.time()-currentTime
    194 elapsedTime = round(elapsedTime,3)
    195 
    196 # Statistics
    197 print '\n== Result ================================'
    198 if elapsedTime > 1:
    199 	print '    Total elapsed time: ', format(elapsedTime), ' seconds'
    200 else:
    201 	print '    Total elapsed time: ', format(elapsedTime), ' second'
    202 print '    Processed files:', format(processedFilesCount)
    203 print '    Processed folders:', format(processedFoldersCount)
    204 print '    Updated files:', format(updatedFilesCount)
    205 print '    Skipped files:', format(skippedFilesCount)