From 55fb663768330c935387da9ee9b5ccbc217e8e69 Mon Sep 17 00:00:00 2001 From: "Antonio J. Delgado" Date: Mon, 13 Jun 2022 11:55:55 +0300 Subject: [PATCH] Find duplicates in vcards --- .../find_duplicate_contacts.py | 92 ++++++++++++++++--- requirements.txt | 3 +- 2 files changed, 82 insertions(+), 13 deletions(-) diff --git a/find_duplicate_contacts/find_duplicate_contacts.py b/find_duplicate_contacts/find_duplicate_contacts.py index 9ccee2f..5570329 100755 --- a/find_duplicate_contacts/find_duplicate_contacts.py +++ b/find_duplicate_contacts/find_duplicate_contacts.py @@ -12,10 +12,12 @@ import click import click_config_file from logging.handlers import SysLogHandler import vobject +import deepdiff +import shutil class find_duplicate_contacts: - def __init__(self, debug_level, log_file, directory): + def __init__(self, debug_level, log_file, directory, duplicates_destination): ''' Initial function called when object is created ''' self.config = dict() self.config['debug_level'] = debug_level @@ -24,23 +26,88 @@ class find_duplicate_contacts: self.config['log_file'] = log_file self._init_log() + self.ignore_fileds = [ + "prodid", + "uid", + "version", + "rev", + "x-thunderbird-etag", + "x-mozilla-html", + "photo" + ] + self.directory = directory + self.duplicates_destination = duplicates_destination + self.duplicates_folder = os.path.join(self.directory, self.duplicates_destination) + if not os.path.exists(self.duplicates_folder): + os.mkdir(self.duplicates_folder) + + self.entries = list() for entry in os.scandir(directory): self.entries.append(entry) - self.process_entries() + self.read_cards() - def process_entries(self): + self.compare_cards() + + def read_cards(self): + self.cards = [] for entry in self.entries: - with open(entry.path, 'r') as filep: - content=filep.read() - card = vobject.readOne(content) - print(entry.path) - print(card.contents.keys()) - sys.exit(0) - + self._log.debug(f"Reading vcard '{entry.path}'...") + card = {} + card['filename'] = entry.path + card['content'] = {} + if not entry.is_dir(): + with open(entry.path, 'r') as filep: + content=filep.read() + if len(content) > 0: + vcard = vobject.readOne(content) + + for key in vcard.contents.keys(): + if key not in self.ignore_fileds: + card['content'][key] = list() + for item in vcard.contents[key]: + card['content'][key].append(item.value) + self.cards.append(card) + def compare_cards(self): + checked_cards = [] + count = 0 + for card in self.cards: + count +=1 + duplicated = False + for checked_card in checked_cards: + if self.are_same_dict(card['content'], checked_card['content']): + duplicated = True + self._log.info(f"Duplicates:\n '{card['filename']}'\n '{checked_card['filename']}") + shutil.move( + card['filename'], + os.path.join(self.duplicates_folder, os.path.basename(card['filename'])) + ) + if not duplicated: + checked_cards.append(card) + self._log.info(f"Found {len(checked_cards)} unique cards") + + def are_same_dict(self, d1, d2): + ddiff = deepdiff.DeepDiff(d1, d2, ignore_order=True) + if ddiff == dict(): + return True + else: + if 'dictionary_item_added' in ddiff or 'dictionary_item_removed' in ddiff: + return False + else: + if 'values_changed' in ddiff: + real_change = False + for key in ddiff['values_changed'].keys(): + if isinstance(ddiff['values_changed'][key]['new_value'], str): + if ddiff['values_changed'][key]['new_value'].lower() != ddiff['values_changed'][key]['old_value'].lower(): + real_change = True + if real_change: + return False + else: + #print(ddiff) + return False def _init_log(self): ''' Initialize log object ''' @@ -81,9 +148,10 @@ class find_duplicate_contacts: ), help='Set the debug level for the standard output.') @click.option('--log-file', '-l', help="File to store all debug messages.") @click.option("--directory", "-d", help="Directory containing vCard files to check.") +@click.option('--duplicates-destination', '-D', default='duplicates', help='Directory to move duplicates files, relative to the directory containing the vCards.') @click_config_file.configuration_option() -def __main__(debug_level, log_file, directory): - return find_duplicate_contacts(debug_level, log_file, directory) +def __main__(debug_level, log_file, directory, duplicates_destination): + return find_duplicate_contacts(debug_level, log_file, directory, duplicates_destination) if __name__ == "__main__": __main__() diff --git a/requirements.txt b/requirements.txt index 3d62a26..911b7f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ click click_config_file -vobject \ No newline at end of file +vobject +deepdiff \ No newline at end of file