From b817fd2c7ec9436e27c792b8174e687948f53c46 Mon Sep 17 00:00:00 2001
From: "Antonio J. Delgado"
Date: Wed, 25 Jan 2023 14:41:31 +0200
Subject: [PATCH] add scan of hashes

---
 find_duplicate_files/find_duplicate_files.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/find_duplicate_files/find_duplicate_files.py b/find_duplicate_files/find_duplicate_files.py
index 6c139b0..ddff556 100644
--- a/find_duplicate_files/find_duplicate_files.py
+++ b/find_duplicate_files/find_duplicate_files.py
@@ -11,6 +11,7 @@ import logging
 import click
 import click_config_file
 from logging.handlers import SysLogHandler
+import hashlib
 
 
 class find_duplicate_files:
@@ -32,14 +33,23 @@ class find_duplicate_files:
         second_files = self.recursive_scandir(self.second_directory)
         self._log.debug(f"Found {len(second_files)} files in second directory '{second_directory}'")
 
+        for hash in first_files:
+            if hash in second_files:
+                print(f"#File '{first_files[hash]}' is dupe with '{second_files[hash]}'.")
+                print(f"rm '{first_files[hash]}'")
+
     def recursive_scandir(self, path, ignore_hidden_files=True):
         ''' Recursively scan a directory for files'''
-        files = []
+        files = dict()
         try:
             for file in os.scandir(path):
                 if not file.name.startswith('.'):
                     if file.is_file():
+                        hash = hashlib.sha256()
+                        with open(file.path, 'rb') as file_pointer:
+                            file_content = file_pointer.read()
+                            hash.update(file_content)
+                        files[hash.hexdigest()] = file.path
                     elif file.is_dir(follow_symlinks=False):
                         more_files = self.recursive_scandir(
                             file.path,
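
For reference, below is a minimal standalone sketch of the hash-and-compare technique this patch introduces: build a digest-to-path map for each directory, then report paths whose SHA-256 digests appear in both. Unlike the patch, it hashes files in fixed-size chunks so large files are not read into memory whole. The directory names 'dir_a' and 'dir_b' and the helper names hash_file/scan_directory are illustrative only, not part of the patched module.

    # Sketch of the patch's hash-and-compare approach; names are illustrative.
    import hashlib
    import os


    def hash_file(path, chunk_size=65536):
        """Return the SHA-256 hex digest of a file, reading it in chunks."""
        digest = hashlib.sha256()
        with open(path, 'rb') as file_pointer:
            while True:
                chunk = file_pointer.read(chunk_size)
                if not chunk:
                    break
                digest.update(chunk)
        return digest.hexdigest()


    def scan_directory(path):
        """Recursively map SHA-256 digests to file paths, skipping hidden entries."""
        files = dict()
        for entry in os.scandir(path):
            if entry.name.startswith('.'):
                continue
            if entry.is_file():
                files[hash_file(entry.path)] = entry.path
            elif entry.is_dir(follow_symlinks=False):
                files.update(scan_directory(entry.path))
        return files


    first_files = scan_directory('dir_a')    # hypothetical first directory
    second_files = scan_directory('dir_b')   # hypothetical second directory
    for digest in first_files:
        if digest in second_files:
            print(f"#File '{first_files[digest]}' is dupe with '{second_files[digest]}'.")
            print(f"rm '{first_files[digest]}'")

As in the patch, each dict keeps a single path per digest, so duplicates inside one tree silently overwrite each other; only matches across the two directories are reported.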