diff --git a/find_duplicate_files/find_duplicate_files.py b/find_duplicate_files/find_duplicate_files.py
index 49db942..f604add 100644
--- a/find_duplicate_files/find_duplicate_files.py
+++ b/find_duplicate_files/find_duplicate_files.py
@@ -12,6 +12,7 @@ import click
 import click_config_file
 from logging.handlers import SysLogHandler
 import zlib
+import sqlite3
 
 
 class find_duplicate_files:
@@ -28,6 +29,8 @@ class find_duplicate_files:
         self.first_directory = first_directory
         self.second_directory = second_directory
 
+        self._init_db_cache()
+
         first_files = self.recursive_scandir(self.first_directory)
         self._log.debug(f"Found {len(first_files)} files in first directory '{first_directory}'")
         second_files = self.recursive_scandir(self.second_directory)
@@ -42,6 +45,26 @@ class find_duplicate_files:
             self._log.info(f"#File '{first_files[hash]}' is dupe with '{second_files[hash]}'.")
             self._log.info(f"rm '{first_files[hash]}'")
 
+    def _init_db_cache(self, cache_file='/var/cache/find_duplicate_files.cache.sql'):
+        self.cache_file = cache_file
+        self.cache_db = sqlite3.connect(self.cache_file)
+        self.cur = self.cache_db.cursor()
+        self.cur.execute("CREATE TABLE IF NOT EXISTS files(hash, file)")
+        self.cache_db.commit()
+
+    def _check_file_cache(self, file):
+        result = self.cur.execute("SELECT hash FROM files WHERE file = ?", (file,))
+        row = result.fetchone()
+        if row is not None:
+            return row[0]
+        else:
+            return False
+
+    def _cache_file(self, file, hash):
+        result = self.cur.execute("INSERT INTO files (hash, file) VALUES (?, ?)", (hash, file))
+        self.cache_db.commit()
+        return result
+
     def recursive_scandir(self, path, ignore_hidden_files=True):
         ''' Recursively scan a directory for files'''
         files = dict()
@@ -49,10 +72,15 @@ class find_duplicate_files:
         for file in os.scandir(path):
             if not file.name.startswith('.'):
                 if file.is_file():
-                    with open(file.path, 'rb') as file_pointer:
-                        file_content = file_pointer.read()
-                        hash = zlib.adler32(file_content)
-                        files[hash] = file.path
+                    check_cache = self._check_file_cache(file.path)
+                    if check_cache:
+                        files[check_cache] = file.path
+                    else:
+                        with open(file.path, 'rb') as file_pointer:
+                            file_content = file_pointer.read()
+                            hash = zlib.adler32(file_content)
+                            files[hash] = file.path
+                            self._cache_file(file.path, hash)
                 elif file.is_dir(follow_symlinks=False):
                     more_files = self.recursive_scandir(
                         file.path,