diff --git a/find_duplicate_files/find_duplicate_files.py b/find_duplicate_files/find_duplicate_files.py
index 192d1e7..a011b1b 100644
--- a/find_duplicate_files/find_duplicate_files.py
+++ b/find_duplicate_files/find_duplicate_files.py
@@ -8,21 +8,29 @@
 import sys
 import os
 import logging
-import click
-import click_config_file
 from logging.handlers import SysLogHandler
 import zlib
 import sqlite3
 import re
+import click
+import click_config_file
 
-class find_duplicate_files:
+class FindDuplicateFiles:
+    '''Find duplicate files'''
 
-    def __init__(self, debug_level, log_file, dummy, first_directory, second_directory, exclude, limit, output_file, delete_duplicates):
+    def __init__(
+        self, debug_level, log_file, dummy, first_directory, second_directory,
+        exclude, limit, output_file, delete_duplicates
+    ):
         ''' Initial function called when object is created '''
-        self.config = dict()
+        self.config = {}
         self.config['debug_level'] = debug_level
         if log_file is None:
-            log_file = os.path.join(os.environ.get('HOME', os.environ.get('USERPROFILE', os.getcwd())), 'log', 'find_duplicate_files.log')
+            log_file = os.path.join(
+                os.environ.get('HOME', os.environ.get('USERPROFILE', os.getcwd())),
+                'log',
+                'FindDuplicateFiles.log'
+            )
         self.config['log_file'] = log_file
         self._init_log()
 
@@ -42,20 +50,24 @@ class find_duplicate_files:
         total = len(first_files)
         count = 0
-        with open(self.output_file, 'w') as output_pointer:
-            for hash in first_files:
+        with open(self.output_file, 'w', encoding='utf-8') as output_pointer:
+            for file_hash in first_files:
                 count += 1
                 self._log.info(f"# Checking file {count} of {total}")
-                if hash in second_files:
-                    self._log.info(f"#File '{first_files[hash]}' is a duplicate of '{second_files[hash]}'.")
+                if file_hash in second_files:
+                    self._log.info(f"#File '{first_files[file_hash]}' is a duplicate of '{second_files[file_hash]}'.")
                     if delete_duplicates:
-                        self._log.warning(f"Removed file '{first_files[hash]}', duplicate of '{second_files[hash]}'.")
-                        os.remove(first_files[hash])
+                        self._log.warning(f"Removed file '{first_files[file_hash]}', duplicate of '{second_files[file_hash]}'.")
+                        os.remove(first_files[file_hash])
                     else:
-                        self._log.info(f"rm '{first_files[hash]}'")
-                        output_pointer.write(f"rm '{first_files[hash]}'\n")
+                        self._log.info(f"rm '{first_files[file_hash]}'")
+                        output_pointer.write(f"rm '{first_files[file_hash]}'\n")
 
-    def _init_db_cache(self, cache_file='/var/cache/find_duplicate_files.cache.sql'):
+    def _init_db_cache(self, cache_file='/var/cache/FindDuplicateFiles.cache.sql'):
+        self._log.debug(
+            "Initializing database file '%s'...",
+            cache_file,
+        )
         self.cache_file = cache_file
         self.cache_db = sqlite3.connect(self.cache_file)
         self.cur = self.cache_db.cursor()
@@ -63,6 +75,10 @@ class find_duplicate_files:
         self.cache_db.commit()
 
     def _check_file_cache(self, file):
+        self._log.debug(
+            "Checking file '%s' in cache...",
+            file
+        )
         file_sql = file.replace("'", "''")
         query = f"SELECT hash FROM files WHERE file = '{file_sql}'"
         row = False
@@ -79,9 +95,14 @@ class find_duplicate_files:
         else:
             return False
 
-    def _cache_file(self, file, hash):
+    def _cache_file(self, file, file_hash):
+        self._log.debug(
+            "Adding file '%s' hash '%s' to cache...",
+            file,
+            file_hash,
+        )
         file_sql = file.replace("'", "''")
-        query = f"INSERT INTO files (file, hash) VALUES ('{file_sql}', '{hash}')"
+        query = f"INSERT INTO files (file, hash) VALUES ('{file_sql}', '{file_hash}')"
         result = False
         if isinstance(query, bytes):
             query = query.decode('utf-8')
@@ -112,7 +133,7 @@ class find_duplicate_files:
     def recursive_scandir(self, path, ignore_hidden_files=True):
         ''' Recursively scan a directory for files '''
-        files = dict()
+        files = {}
         if os.path.exists(path):
             try:
                 for file in os.scandir(path):
@@ -128,9 +149,9 @@ class find_duplicate_files:
                         else:
                             with open(file.path, 'rb') as file_pointer:
                                 file_content = file_pointer.read()
-                                hash = zlib.adler32(file_content)
-                                files[hash] = file.path
-                                self._cache_file(file.path, hash)
+                                file_hash = zlib.adler32(file_content)
+                                files[file_hash] = file.path
+                                self._cache_file(file.path, file_hash)
                     elif file.is_dir(follow_symlinks=False):
                         more_files = self.recursive_scandir(
                             file.path,
@@ -147,7 +168,7 @@ class find_duplicate_files:
     def _init_log(self):
         ''' Initialize log object '''
-        self._log = logging.getLogger("find_duplicate_files")
+        self._log = logging.getLogger("FindDuplicateFiles")
         self._log.setLevel(logging.DEBUG)
 
         sysloghandler = SysLogHandler()
@@ -163,7 +184,7 @@ class find_duplicate_files:
         else:
            home_folder = os.environ.get('HOME', os.environ.get('USERPROFILE', ''))
            log_folder = os.path.join(home_folder, "log")
-           log_file = os.path.join(log_folder, "find_duplicate_files.log")
+           log_file = os.path.join(log_folder, "FindDuplicateFiles.log")
 
         if not os.path.exists(os.path.dirname(log_file)):
             os.mkdir(os.path.dirname(log_file))
@@ -192,8 +213,7 @@ class find_duplicate_files:
 @click.option('--delete-duplicates', default=False, is_flag=True, help='Delete duplicate files instead of creating commands file')
 @click_config_file.configuration_option()
 def __main__(debug_level, log_file, dummy, first_directory, second_directory, exclude, limit, output_file, delete_duplicates):
-    return find_duplicate_files(debug_level, log_file, dummy, first_directory, second_directory, exclude, limit, output_file, delete_duplicates)
+    return FindDuplicateFiles(debug_level, log_file, dummy, first_directory, second_directory, exclude, limit, output_file, delete_duplicates)
 
 if __name__ == "__main__":
     __main__()
-
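
One note on the cache helpers this patch touches: `_check_file_cache` and `_cache_file` still build SQL by escaping single quotes and interpolating values into f-strings. Below is a minimal sketch, not part of the patch, of the same SELECT/INSERT pair using sqlite3 parameter binding; the `files(file, hash)` table name is taken from the queries visible above, and the stand-alone function names are placeholders for illustration.

```python
import sqlite3

# Sketch only: assumes the same files(file, hash) table created by _init_db_cache.
# Placeholders let sqlite3 quote values itself, so the manual
# file.replace("'", "''") escaping becomes unnecessary.
def cache_file(cur: sqlite3.Cursor, file: str, file_hash: int) -> None:
    cur.execute("INSERT INTO files (file, hash) VALUES (?, ?)", (file, file_hash))

def check_file_cache(cur: sqlite3.Cursor, file: str):
    # Returns the cached hash, or False when the file has not been seen yet.
    row = cur.execute("SELECT hash FROM files WHERE file = ?", (file,)).fetchone()
    return row[0] if row else False
```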