commit ea4c1389ba7fd3f77160ea68d0e56e510e42a15b
parent f90e3c0d418e0fd618ef8a511a2864581e8f6cde
Author: Georges Dupéron <jahvascriptmaniac+github@free.fr>
Date: Thu, 22 Sep 2011 12:16:50 +0200
Move database entries for deleted files to a separate history table.
Diffstat:
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/updatehash.py b/updatehash.py
@@ -28,6 +28,7 @@ def initdb(cursor):
cursor.execute("create table if not exists files(tag,timestamp,path primary key,md5,sha1,mtime,size)")
cursor.execute("create index if not exists i_files_tag on files(tag)")
cursor.execute("create index if not exists i_files_path_md5_sha1 on files(path,md5,sha1)")
+ cursor.execute("create table removedfiles(rmtimestamp,tag,timestamp,path,md5,sha1,mtime,size)")
def cacheFileInfo(cursor, path):
cursor.execute('select mtime,size from files where path = ?', (path,))
@@ -35,6 +36,8 @@ def cacheFileInfo(cursor, path):
return data and {'mtime':data[0], 'size':data[1]}
def update(connection,cursor,path):
+ cursor.execute("create temp table newfiles(path)")
+ cursor.execute("create index i_newfiles_path on newfiles(path)")
timestamp = time.time()
currentTime = time.clock()
lastTime = currentTime
@@ -48,6 +51,7 @@ def update(connection,cursor,path):
print "!skipping", fpath
continue
cfi = cacheFileInfo(cursor,fpath)
+ cursor.execute("insert into newfiles(path) values(?)", (fpath,))
if fi != cfi:
print " updating", fpath
sums = checksumFile(fpath)
@@ -59,6 +63,12 @@ def update(connection,cursor,path):
lastTime = currentTime
connection.commit()
print "commit!"
+ connection.commit()
+ print "commit!"
+ print "cleaning up..."
+ cursor.execute("insert into removedfiles(rmtimestamp,tag,timestamp,path,md5,sha1,mtime,size) select ?,tag,timestamp,path,md5,sha1,mtime,size from files where path not in newfiles", (timestamp,))
+ cursor.execute("delete from files where path not in (select path from newfiles)")
+ connection.commit()
def walk(db,path):
connection = sqlite3.connect(db)
@@ -66,7 +76,6 @@ def walk(db,path):
cursor = connection.cursor()
initdb(cursor)
update(connection, cursor, path)
- connection.commit()
cursor.close()
def help():
diff --git a/updatehash.sql b/updatehash.sql
@@ -0,0 +1,8 @@
+-- Total size that could be reclaimed by removing duplicates (one copy of each duplicated file is kept and is not counted)
+select round((B.tot-A.tot)/(1024.*1024.*1024.),2)||' Gb' from (select sum(size) as tot from (select distinct md5,sha1,size from files)) as A, (select sum(size) as tot from (select md5,sha1,size from files)) as B;
+
+-- List of duplicated files (every copy is listed), ordered by size
+select size,path from files where md5||'#'||sha1||'#'||size in (select md5||'#'||sha1||'#'||size from files group by md5,sha1,size having count(path) > 1) order by size;
+
+-- Total number of files and their combined size in Gb (sum of size divided by 1024^3)
+select round(sum(size)/(1024.*1024.*1024.),2)||' Gb '||count(size)||' files' from files;