diff --git a/find_duplicate_files/find_duplicate_files.py b/find_duplicate_files/find_duplicate_files.py index 6c139b0..ddff556 100644 --- a/find_duplicate_files/find_duplicate_files.py +++ b/find_duplicate_files/find_duplicate_files.py @@ -11,6 +11,7 @@ import logging import click import click_config_file from logging.handlers import SysLogHandler +import hashlib class find_duplicate_files: @@ -32,14 +33,23 @@ class find_duplicate_files: second_files = self.recursive_scandir(self.second_directory) self._log.debug(f"Found {len(second_files)} files in second directory '{second_directory}'") + for hash in first_files: + if hash in second_files: + print(f"#File '{first_files[hash]}' is dupe with '{second_files[hash]}'.") + print(f"rm '{first_files[hash]}'") + def recursive_scandir(self, path, ignore_hidden_files=True): ''' Recursively scan a directory for files''' - files = [] + files = dict() try: for file in os.scandir(path): if not file.name.startswith('.'): if file.is_file(): - files.append(file) + hash = hashlib.sha256() + with open(file.name, 'r') as file_pointer: + file_content = file_pointer.read() + hash.update(file_content) + files[hash.hexdigest()] = file.name elif file.is_dir(follow_symlinks=False): more_files = self.recursive_scandir( file.path,