diff --git a/find_duplicate_files/find_duplicate_files.py b/find_duplicate_files/find_duplicate_files.py
index c97f5a1..a96234c 100644
--- a/find_duplicate_files/find_duplicate_files.py
+++ b/find_duplicate_files/find_duplicate_files.py
@@ -13,10 +13,11 @@ import click_config_file
 from logging.handlers import SysLogHandler
 import zlib
 import sqlite3
+import re
 
 
 class find_duplicate_files:
-    def __init__(self, debug_level, log_file, dummy, first_directory, second_directory):
+    def __init__(self, debug_level, log_file, dummy, first_directory, second_directory, exclude):
         ''' Initial function called when object is created '''
         self.config = dict()
         self.config['debug_level'] = debug_level
@@ -28,6 +29,7 @@ class find_duplicate_files:
         self.dummy = dummy
         self.first_directory = first_directory
         self.second_directory = second_directory
+        self.exclude = exclude
         self._init_db_cache()
 
 
@@ -73,29 +75,37 @@ class find_duplicate_files:
         else:
             return False
 
+    def _test_exclude(self, file_name):
+        """Return True if file_name matches ANY of the exclusion patterns.
+
+        Must test every pattern before concluding there is no match;
+        returning False inside the loop would only ever check the first one.
+        """
+        for exclude in self.exclude:
+            if re.search(exclude, file_name):
+                return True
+        return False
+
     def recursive_scandir(self, path, ignore_hidden_files=True):
         ''' Recursively scan a directory for files'''
         files = dict()
         try:
             for file in os.scandir(path):
                 if not file.name.startswith('.'):
-                    if file.is_file():
-                        check_cache = self._check_file_cache(file.path)
-                        if check_cache:
-                            files[check_cache] = file.path
-                        else:
-                            with open(file.path, 'rb') as file_pointer:
-                                file_content = file_pointer.read()
-                                hash = zlib.adler32(file_content)
-                            files[hash] = file.path
-                            self._cache_file(file.path, hash)
-                    elif file.is_dir(follow_symlinks=False):
-                        more_files = self.recursive_scandir(
-                            file.path,
-                            ignore_hidden_files=ignore_hidden_files
-                        )
-                        if more_files:
-                            files = { **files, **more_files }
+                    if not self._test_exclude(file.path):
+                        if file.is_file():
+                            check_cache = self._check_file_cache(file.path)
+                            if check_cache:
+                                files[check_cache] = file.path
+                            else:
+                                with open(file.path, 'rb') as file_pointer:
+                                    file_content = file_pointer.read()
+                                    hash = zlib.adler32(file_content)
+                                files[hash] = file.path
+                                self._cache_file(file.path, hash)
+                        elif file.is_dir(follow_symlinks=False):
+                            more_files = self.recursive_scandir(
+                                file.path,
+                                ignore_hidden_files=ignore_hidden_files
+                            )
+                            if more_files:
+                                files = { **files, **more_files }
         except PermissionError as error:
             self._log.warning(f"Permission denied accessing folder '{path}'")
         self._log.debug(f"Found {len(files)} files. Cache contains {self._cache_size()} records.")
@@ -142,9 +152,10 @@ class find_duplicate_files:
 @click.option("--dummy","-n", is_flag=True, help="Don't do anything, just show what would be done.") # Don't forget to add dummy to parameters of main function
 @click.option('--first-directory', '-f', required=True, help='First directory to find files AND TO DELETE FILES FROM!!!')
 @click.option('--second-directory', '-s', required=True, help='Second directory to find files')
+@click.option('--exclude', '-e', multiple=True, help='Regular expression pattern to exclude from files and directories.')
 @click_config_file.configuration_option()
-def __main__(debug_level, log_file, dummy, first_directory, second_directory):
-    return find_duplicate_files(debug_level, log_file, dummy, first_directory, second_directory)
+def __main__(debug_level, log_file, dummy, first_directory, second_directory, exclude):
+    return find_duplicate_files(debug_level, log_file, dummy, first_directory, second_directory, exclude)
 
 if __name__ == "__main__":
     __main__()