Improve performance by fetching each message only once

2024-09-05 18:50:11 +03:00 · 2024-09-05 18:50:11 +03:00 · a2890e0264
commit a2890e0264
parent 6178df97b6
2 changed files with 435 additions and 54 deletions
--- a/imap_filter/imap_filter.py
+++ b/imap_filter/imap_filter.py
@ -14,6 +14,8 @@ import email
 from signal import signal, SIGINT
 import json
 import re
+import codecs
+import time
 import click
 import click_config_file

@ -21,6 +23,7 @@ class ImapFilter:
    '''IMAP filter tool'''

    def __init__(self, **kwargs):
+        start_time = time.time()
        self.config = kwargs
        if 'log_file' not in kwargs or kwargs['log_file'] is None:
            self.config['log_file'] = os.path.join(
@ -36,29 +39,28 @@ class ImapFilter:
            )
        self._init_log()
        signal(SIGINT, self._signal_handler)
-        self._convert_filters()
-        if self.config['filters_file']:
-            self._read_filters_file()
-        if len(self.config['filter']) == 0:
+        with open(self.config['filters_file'], 'r', encoding='UTF-8') as filters_file:
+            self.config['mailboxes'] = json.load(filters_file)
+        if len(self.config['mailboxes']) == 0:
            self._log.error(
-                "You must indicate either a filter or a filters-file. Use --help to see more details."
+                "Filters file is empty. Use --help to see more details."
            )
            sys.exit(1)
        self.connect_imap()
        self._process_filters()
-
-    def _read_filters_file(self):
-        with open(self.config['filters_file'], 'r', encoding='UTF-8') as filters_file:
-            new_filters = json.load(filters_file)
-        for new_filter in new_filters:
-            self.config['filter'].append(new_filter)
+        self._log.debug(
+            "Took %s seconds to process",
+            time.time() - start_time
+        )

    def _process_filters(self):
-        for mfilter in self.config['filter']:
-            self.matches = 0
-            self._log.debug("Processing filter '%s'...", mfilter)
-            self.imap.select(mailbox=mfilter['mailbox'], readonly=False)
-            self._log.debug("Searching for all messages in mailbox '%s'...", mfilter['mailbox'])
+        for mailbox in self.config['mailboxes']:
+            self._log.debug(
+                "Processing mailbox '%s'...",
+                mailbox['mailbox']
+            )
+            self.imap.select(mailbox=mailbox['mailbox'], readonly=False)
+            self._log.debug("Searching for all messages in mailbox '%s'...", mailbox['mailbox'])
            typ, data = self.imap.search('UTF-8', 'ALL')
            if typ != 'OK':
                self._log.error('Error, server replied: %s', data)
@ -68,9 +70,10 @@ class ImapFilter:
            self._log.debug(
                "Processing %s messages in mailbox '%s'...",
                total_msgs,
-                mfilter['mailbox']
+                mailbox['mailbox']
            )
            msg_count = 0
+            self.matches = 0
            for message_id in all_msgs_uids:
                msg_count += 1
                self._log.debug(
@ -87,25 +90,38 @@ class ImapFilter:
                    if typ != 'OK':
                        self._log.error('Error, server replied: %s', unseen_data)
                        return False
-                    self._process_message(message_id, data[0], mfilter)
+                    for mfilter in mailbox['filters']:
+                        self._log.debug("Processing filter '%s'...", mfilter['name'])
+                        self._process_message(message_id, data[0], mfilter)
                except imaplib.IMAP4.error as error:
                    self._log.error(
                        "Error fetching message. %s",
                        error
                    )
-            self._log.debug(
-                "A total of %s matches for this filter",
-                self.matches
+            if self.matches > 0:
+                self._log.info(
+                    "A total of %s matches for this mailbox",
+                    self.matches
+                )
+            else:
+                self._log.info(
+                    "No matches for this mailbox"
+                )
+        try:
+            self.imap.expunge()
+        except imaplib.IMAP4.abort as error:
+            self._log.error(
+                "Error expunging connection. %s",
+                error
            )
-        self.imap.expunge()
+        return True

    def _process_message(self, message_id, data, mfilter):
        '''Process a mail message'''
        if isinstance(data[1], int):
            self._log.warning(
-                "Response part is integer %s in data '%s'. Try again.",
-                data[1],
-                data[0]
+                "Unexpected response fetching message: '%s'. Try again.",
+                data
            )
            return False
        part = data[1].decode('utf-8')
@ -115,31 +131,52 @@ class ImapFilter:
            field_data = decoded_field[0][0]
        else:
            field_data = decoded_field[0][0].decode()
-        match = re.match(mfilter['regexp'], field_data)
+        if 'words' in mfilter:
+            regexp = '^(?=.*' + '.*|.*'.join(mfilter['words']) + '.*)'
+        else:
+            if 'regexp' in mfilter:
+                regexp = mfilter['regexp']
+            else:
+                self._log.error(
+                    "The filter '%s' doesn't have a 'words' or 'regexp' value. %s",
+                    mfilter['name'],
+                    mfilter
+                )
+        match = re.match(regexp, field_data)
        if match:
            self._log.info(
-                "Field '%s' => '%s', matches '%s'",
+                "Field '%s' => '%s', matches filter '%s'",
                mfilter['field'],
                field_data,
-                mfilter['regexp']
+                mfilter['name']
            )
            self.matches += 1
            if self.config['dummy']:
                self._log.info('Doing nothing (dummy run)')
            else:
                self._do_filter(message_id, mfilter)
+        return True

    def _do_filter(self, message_id, mfilter):
        if f"_action_{mfilter['action']}" in dir(self):
            function = getattr(self, f"_action_{mfilter['action']}")
+            # try:
            result = function(message_id, mfilter)
            self._log.debug(
                "Result: %s",
                result
            )
+            # except Exception as error:
+            #     self._log.error(
+            #         b"Error performing '%s' action with message %s. Filter: %s. Error: %s",
+            #         mfilter['action'],
+            #         message_id,
+            #         mfilter,
+            #         error
+            #     )

    def _action_move(self, message_id, mfilter):
-        self._log.debug(
+        self._log.info(
            "Moving message '%s' to '%s'...",
            message_id,
            mfilter['destination']
@ -152,7 +189,7 @@ class ImapFilter:
            "Creating mailbox '%s'...",
            mailbox
        )
-        typ, data = self.imap.create(mailbox)
+        typ, data = self.imap.create(mailbox.encode('utf-8', 'replace'))
        if typ != 'OK':
            self._log.error(
                'Error creating mailbox %s, server replied: %s',
@ -163,16 +200,20 @@ class ImapFilter:
        return True

    def _action_copy(self, message_id, mfilter):
-        self._log.debug(
+        self._log.info(
            "Copying message '%s' to '%s'...",
            message_id,
            mfilter['destination']
        )
-        typ, data = self.imap.copy(message_id, mfilter['destination'])
+        mailbox_encoded = codecs.encode(
+            mfilter['destination'],
+            encoding="utf-7"
+        ).replace(b"+", b"&")
+        typ, data = self.imap.copy(message_id, mailbox_encoded)
        if typ != 'OK':
            if b'[TRYCREATE]' in data[0]:
-                if self._create_mailbox(mfilter['destination']):
-                    typ, data = self.imap.copy(message_id, mfilter['destination'])
+                if self._create_mailbox(mailbox_encoded):
+                    typ, data = self.imap.copy(message_id,mailbox_encoded)
                    if typ != 'OK':
                        self._log.error(
                            'Error copying message to %s, server replied: %s',
@ -190,7 +231,7 @@ class ImapFilter:
        return True

    def _action_delete(self, message_id, mfilter):
-        self._log.debug(
+        self._log.info(
            "Deleting message '%s'...",
            message_id
        )
@ -205,7 +246,7 @@ class ImapFilter:
        return True

    def _action_mark_seen(self, message_id, mfilter):
-        self._log.debug(
+        self._log.info(
            "Marking as seen message '%s'...",
            message_id
        )
@ -219,12 +260,6 @@ class ImapFilter:
            return False
        return True

-    def _convert_filters(self):
-        new_filter = []
-        for old_filter in self.config['filter']:
-            new_filter.append(json.loads(old_filter))
-        self.config['filter'] = new_filter
-
    def connect_imap(self):
        '''Create connection object to the IMAP server'''
        self._log.debug(
@ -345,12 +380,7 @@ class ImapFilter:
    help='Whether to use a secure connection or not.'
 )
@click.option(
-    '--filter', '-f', required=False,
-    multiple=True,
-    help='Filter rule.'
-)
-@click.option(
-    '--filters-file', '-F', required=False,
+    '--filters-file', '-F', required=True,
    help='JSON file containing a list of dictionaries with the filter rules.'
 )
@click_config_file.configuration_option()
--- a/test.json
+++ b/test.json
@ -1,10 +1,361 @@
 [
    {
-        "name": "python",
-        "mailbox": "Feeds/Mastodon/Test",
-        "field": "Subject",
-        "regexp": "^(?=.*nvidia.*|.*Nvidia.*|.*ansible.*|.*Ansible.*|.*ubuntu.*|.*Ubuntu.*|.*blender.*|.*Blender.*|.*technology.*|.*Technology.*|.*msdos.*|.*dosbox.*|.*python.*|.*Python.*|.*devops.*|.*DevOps.*|.*forgejo.*|.*Forgejo.*|.*smartphone.*|.*Smartphone.*|.*SmartPhone.*|.*Android.*|.*android.*|.*github.*|.*Github.*|.*gitlab.*|.*Gitlab.*|.*#programming.*|.*TechCrunch.*|.*researchbuzz.*|.*ripencc.*|.*FCAI.*|.*TechDesk.*|.*#selfhosting.*|.*#selfhosted.*|.*#ai.*|.*#deepfake.*|.*#chatgpt.*|.*#tietotekniikka.*|.*#videogames.*|.*#software.*|.*#retrogaming.*|.*#web.*|.*#gaming.*|.*#pcgaming.*|.*#gamedev.*|.*#fairphone.*|.*#ebike.*|.*#windows.*|.*#speedrun.*|.*#cloud.*|.*#euhosted.*|.*#python.*|.*#steamdeck.*|.*#indiegame.*|.*#webdev.*|.*#rustlang.*|.*#valve.*|.*#intel.*|.*#dns.*|.*#digitaljustice.*|.*#tv.*)",
-        "action": "move",
-        "destination": "Feeds/Mastodon/Test/tech"
+        "mailbox": "Feeds/Mastodon",
+        "filters": [
+            {
+                "name": "tech",
+                "field": "Subject",
+                "words": [
+                    "nvidia",
+                    "ansible",
+                    "ubuntu",
+                    "#blender",
+                    "#technology",
+                    "msdos",
+                    "dosbox",
+                    "#python",
+                    "devops",
+                    "forgejo",
+                    "#smartphone",
+                    "#android",
+                    "github",
+                    "gitlab",
+                    "#programming",
+                    "TechCrunch",
+                    "researchbuzz",
+                    "ripencc",
+                    "FCAI",
+                    "TechDesk",
+                    "selfhost",
+                    "#ai",
+                    "#deepfake",
+                    "#chatgpt",
+                    "#tietotekniikka",
+                    "#videogames",
+                    "#software",
+                    "#retrogaming",
+                    "#web",
+                    "#gaming",
+                    "#pcgaming",
+                    "#gamedev",
+                    "#fairphone",
+                    "#ebike",
+                    "#windows",
+                    "#speedrun",
+                    "#cloud",
+                    "#euhosted",
+                    "#python",
+                    "#steamdeck",
+                    "#indiegame",
+                    "#webdev",
+                    "#rustlang",
+                    "#valve",
+                    "#intel",
+                    "#dns",
+                    "#digitaljustice",
+                    "#tv",
+                    "#internetarchive"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Tech"
+            },
+            {
+                "name": "tampere",
+                "field": "Subject",
+                "words": [
+                    "tampere",
+                    "Tampere",
+                    "pirkanmaa",
+                    "Pirkanmaa",
+                    "kaukajärv",
+                    "Kaukajärv",
+                    "#pirkkala"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Tampere"
+            },
+            {
+                "name": "tampere_body",
+                "field": "Body",
+                "regexp": "^.*\"content\": \"(?=.*tampere.*|.*Tampere.*|.*pirkanmaa.*|.*Pirkanmaa.*|.*kaukajärv.*|.*Kaukajärv.*|.*#pirkkala)",
+                "action": "move",
+                "destination": "Feeds/Mastodon/Tampere"
+            },
+            {
+                "name": "infosec",
+                "field": "Subject",
+                "words": [
+                    "#infosec",
+                    "#hacking",
+                    "#defcon",
+                    "thehackernews",
+                    "#cybersecurity",
+                    "#opsec",
+                    "#surveillance",
+                    "#encryption",
+                    "#security",
+                    "#spyware",
+                    "#ninjalab",
+                    "#yubikey",
+                    "#yubico"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/InfoSec"
+            },
+            {
+                "name": "infosec_body",
+                "field": "Body",
+                "regexp": "^.*\"content\": \"(?=.*infosec.*|.*InfoSec .*|.*hacking.*|.*Hacking .*|.*defcon.*|.*DefCon .*|.*thehackernews.*|.*cybersecurity .*|.*Cybersecurity .*|.*opsec .*|.*OpSec .*|.*#surveillance.*|.*#encryption.*|.*#security.*|.*#spyware .*|.*#ninjalab.*|.*#yubikey.*|.*#yubico.*)",
+                "action": "move",
+                "destination": "Feeds/Mastodon/InfoSec"
+            },
+            {
+                "name": "movies",
+                "field": "Subject",
+                "words": [
+                    "#movie",
+                    "#film",
+                    "#trailer",
+                    "#moviesuggestion",
+                    "rottentomatoes",
+                    "RottenTomatoes",
+                    "#pelicula",
+                    "#cine"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Movies"
+            },
+            {
+                "name": "almeria",
+                "field": "Subject",
+                "words": [
+                    "#almeria",
+                    "#almería",
+                    "#cabodegata",
+                    "#costadelaluz"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Almería"
+            },
+            {
+                "name": "almeria",
+                "field": "Body",
+                "regexp": "^.*\"content\": \"(?=.*almeria.*|.*Almeria .*|.*Almería.*|.*almería .*|.*cabo de gata.*|.*Cabo De Gata .*|.*Cabo de Gata.*|.*Costa de la Luz .*|.*Costa De La Luz .*|.*costa de la luz .*)",
+                "action": "move",
+                "destination": "Feeds/Mastodon/Almería"
+            },
+            {
+                "name": "Jobs",
+                "field": "Subject",
+                "words": [
+                    "#gethired",
+                    "#joboffer",
+                    "#hiring",
+                    "#flossjobs",
+                    "#layoffs",
+                    "#lookingforwork",
+                    "#osjobjub",
+                    "#trabajo",
+                    "#laboral",
+                    "#jobhunt",
+                    "#fedihire",
+                    "#fedijobs",
+                    "#getfedihired",
+                    "#trabajoremoto",
+                    "#remotework",
+                    "#wfh",
+                    "#opensourcejobs"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Jobs"
+            },
+            {
+                "name": "OpenSource",
+                "field": "Subject",
+                "words": [
+                    "#pinephone",
+                    "#pinetime",
+                    "#pinetab",
+                    "#jellyfin",
+                    "linux",
+                    "foss",
+                    "opensource",
+                    "freesoftware",
+                    "LibreOffice",
+                    "libreoffice",
+                    "#sailfishos",
+                    "#nextcloud",
+                    "#righttorepair",
+                    "#phosh",
+                    "#debian",
+                    "#openstreetmap",
+                    "#mobian",
+                    "#gnome",
+                    "#kde",
+                    "#xfce",
+                    "#wayland",
+                    "#waydroid",
+                    "#steam",
+                    "#godot",
+                    "#inkscape",
+                    "#homeassistant",
+                    "#openhome"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/linux"
+            },
+            {
+                "name": "Cats",
+                "field": "Subject",
+                "words": [
+                    "#caturday",
+                    "#catsofmastodon",
+                    "#martesdegatos"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Caturday"
+            },
+            {
+                "name": "Dogs",
+                "field": "Subject",
+                "words": [
+                    "#dogsofmastodon"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/DogsOfMastodon"
+            },
+            {
+                "name": "Fediverse",
+                "field": "Subject",
+                "words": [
+                    "#fediverse",
+                    "#mastodon",
+                    "peertube",
+                    "pixelfeed",
+                    "activitypub",
+                    "ActivityPub",
+                    "#mastodonadmin"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Fediverse"
+            },
+            {
+                "name": "Humor",
+                "field": "Subject",
+                "words": [
+                    "#humor",
+                    "#funny",
+                    "#meme"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Humor"
+            },
+            {
+                "name": "New",
+                "field": "Subject",
+                "words": [
+                    "#news",
+                    "#noticias",
+                    "ElSaltoDiario",
+                    "earthquake",
+                    "zoom_earth"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/News"
+            },
+            {
+                "name": "Jazz",
+                "field": "Subject",
+                "words": [
+                    "jazz",
+                    "Jazz"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/News"
+            },
+            {
+                "name": "USPolitics",
+                "field": "Subject",
+                "words": [
+                    "#uspol",
+                    "#kamalaharris",
+                    "#trump",
+                    "#biden",
+                    "#gop",
+                    "#texas"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/USPolitics"
+            },
+            {
+                "name": "España",
+                "field": "Subject",
+                "words": [
+                    "#spanish",
+                    "#spain",
+                    "#españa",
+                    "#espanja",
+                    "#malaga",
+                    "#málaga",
+                    "#madrid",
+                    "#andalucia",
+                    "#sevilla",
+                    "#barcelona",
+                    "#mallorca",
+                    "#ibiza",
+                    "#canaryislands",
+                    "#balear",
+                    "España"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/España"
+            },
+            {
+                "name": "Birds",
+                "field": "Subject",
+                "words": [
+                    "#birds"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Birds"
+            },
+            {
+                "name": "Finland",
+                "field": "Subject",
+                "words": [
+                    "#finland",
+                    "#suomi"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Finland"
+            },
+            {
+                "name": "Autism",
+                "field": "Subject",
+                "words": [
+                    "#actuallyautistic",
+                    "#autism",
+                    "#autistic"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Autism"
+            },
+            {
+                "name": "Art",
+                "field": "Subject",
+                "words": [
+                    "#art",
+                    "#artist",
+                    "#streetart",
+                    "#mural",
+                    "#photography",
+                    "#fotografie",
+                    "#photoart",
+                    "#urbanart"
+                ],
+                "action": "move",
+                "destination": "Feeds/Mastodon/Art"
+            }
+        ]
    }
 ]