Improve performance by fetching each message only once

This commit is contained in:
Antonio J. Delgado 2024-09-05 18:50:11 +03:00
parent 6178df97b6
commit a2890e0264
2 changed files with 435 additions and 54 deletions

View file

@ -14,6 +14,8 @@ import email
from signal import signal, SIGINT
import json
import re
import codecs
import time
import click
import click_config_file
@ -21,6 +23,7 @@ class ImapFilter:
'''IMAP filter tool'''
def __init__(self, **kwargs):
start_time = time.time()
self.config = kwargs
if 'log_file' not in kwargs or kwargs['log_file'] is None:
self.config['log_file'] = os.path.join(
@ -36,29 +39,28 @@ class ImapFilter:
)
self._init_log()
signal(SIGINT, self._signal_handler)
self._convert_filters()
if self.config['filters_file']:
self._read_filters_file()
if len(self.config['filter']) == 0:
with open(self.config['filters_file'], 'r', encoding='UTF-8') as filters_file:
self.config['mailboxes'] = json.load(filters_file)
if len(self.config['mailboxes']) == 0:
self._log.error(
"You must indicate either a filter or a filters-file. Use --help to see more details."
"Filters file is empty. Use --help to see more details."
)
sys.exit(1)
self.connect_imap()
self._process_filters()
def _read_filters_file(self):
with open(self.config['filters_file'], 'r', encoding='UTF-8') as filters_file:
new_filters = json.load(filters_file)
for new_filter in new_filters:
self.config['filter'].append(new_filter)
self._log.debug(
"Took %s seconds to process",
time.time() - start_time
)
def _process_filters(self):
for mfilter in self.config['filter']:
self.matches = 0
self._log.debug("Processing filter '%s'...", mfilter)
self.imap.select(mailbox=mfilter['mailbox'], readonly=False)
self._log.debug("Searching for all messages in mailbox '%s'...", mfilter['mailbox'])
for mailbox in self.config['mailboxes']:
self._log.debug(
"Processing mailbox '%s'...",
mailbox['mailbox']
)
self.imap.select(mailbox=mailbox['mailbox'], readonly=False)
self._log.debug("Searching for all messages in mailbox '%s'...", mailbox['mailbox'])
typ, data = self.imap.search('UTF-8', 'ALL')
if typ != 'OK':
self._log.error('Error, server replied: %s', data)
@ -68,9 +70,10 @@ class ImapFilter:
self._log.debug(
"Processing %s messages in mailbox '%s'...",
total_msgs,
mfilter['mailbox']
mailbox['mailbox']
)
msg_count = 0
self.matches = 0
for message_id in all_msgs_uids:
msg_count += 1
self._log.debug(
@ -87,25 +90,38 @@ class ImapFilter:
if typ != 'OK':
self._log.error('Error, server replied: %s', unseen_data)
return False
self._process_message(message_id, data[0], mfilter)
for mfilter in mailbox['filters']:
self._log.debug("Processing filter '%s'...", mfilter['name'])
self._process_message(message_id, data[0], mfilter)
except imaplib.IMAP4.error as error:
self._log.error(
"Error fetching message. %s",
error
)
self._log.debug(
"A total of %s matches for this filter",
self.matches
if self.matches > 0:
self._log.info(
"A total of %s matches for this mailbox",
self.matches
)
else:
self._log.info(
"No matches for this mailbox"
)
try:
self.imap.expunge()
except imaplib.IMAP4.abort as error:
self._log.error(
"Error expunging connection. %s",
error
)
self.imap.expunge()
return True
def _process_message(self, message_id, data, mfilter):
'''Process a mail message'''
if isinstance(data[1], int):
self._log.warning(
"Response part is integer %s in data '%s'. Try again.",
data[1],
data[0]
"Unexpected response fetching message: '%s'. Try again.",
data
)
return False
part = data[1].decode('utf-8')
@ -115,31 +131,52 @@ class ImapFilter:
field_data = decoded_field[0][0]
else:
field_data = decoded_field[0][0].decode()
match = re.match(mfilter['regexp'], field_data)
if 'words' in mfilter:
regexp = '^(?=.*' + '.*|.*'.join(mfilter['words']) + '.*)'
else:
if 'regexp' in mfilter:
regexp = mfilter['regexp']
else:
self._log.error(
"The filter '%s' doesn't have a 'words' or 'regexp' value. %s",
mfilter['name'],
mfilter
)
match = re.match(regexp, field_data)
if match:
self._log.info(
"Field '%s' => '%s', matches '%s'",
"Field '%s' => '%s', matches filter '%s'",
mfilter['field'],
field_data,
mfilter['regexp']
mfilter['name']
)
self.matches += 1
if self.config['dummy']:
self._log.info('Doing nothing (dummy run)')
else:
self._do_filter(message_id, mfilter)
return True
def _do_filter(self, message_id, mfilter):
if f"_action_{mfilter['action']}" in dir(self):
function = getattr(self, f"_action_{mfilter['action']}")
# try:
result = function(message_id, mfilter)
self._log.debug(
"Result: %s",
result
)
# except Exception as error:
# self._log.error(
# b"Error performing '%s' action with message %s. Filter: %s. Error: %s",
# mfilter['action'],
# message_id,
# mfilter,
# error
# )
def _action_move(self, message_id, mfilter):
self._log.debug(
self._log.info(
"Moving message '%s' to '%s'...",
message_id,
mfilter['destination']
@ -152,7 +189,7 @@ class ImapFilter:
"Creating mailbox '%s'...",
mailbox
)
typ, data = self.imap.create(mailbox)
typ, data = self.imap.create(mailbox.encode('utf-8', 'replace'))
if typ != 'OK':
self._log.error(
'Error creating mailbox %s, server replied: %s',
@ -163,16 +200,20 @@ class ImapFilter:
return True
def _action_copy(self, message_id, mfilter):
self._log.debug(
self._log.info(
"Copying message '%s' to '%s'...",
message_id,
mfilter['destination']
)
typ, data = self.imap.copy(message_id, mfilter['destination'])
mailbox_encoded = codecs.encode(
mfilter['destination'],
encoding="utf-7"
).replace(b"+", b"&")
typ, data = self.imap.copy(message_id, mailbox_encoded)
if typ != 'OK':
if b'[TRYCREATE]' in data[0]:
if self._create_mailbox(mfilter['destination']):
typ, data = self.imap.copy(message_id, mfilter['destination'])
if self._create_mailbox(mailbox_encoded):
typ, data = self.imap.copy(message_id,mailbox_encoded)
if typ != 'OK':
self._log.error(
'Error copying message to %s, server replied: %s',
@ -190,7 +231,7 @@ class ImapFilter:
return True
def _action_delete(self, message_id, mfilter):
self._log.debug(
self._log.info(
"Deleting message '%s'...",
message_id
)
@ -205,7 +246,7 @@ class ImapFilter:
return True
def _action_mark_seen(self, message_id, mfilter):
self._log.debug(
self._log.info(
"Marking as seen message '%s'...",
message_id
)
@ -219,12 +260,6 @@ class ImapFilter:
return False
return True
def _convert_filters(self):
new_filter = []
for old_filter in self.config['filter']:
new_filter.append(json.loads(old_filter))
self.config['filter'] = new_filter
def connect_imap(self):
'''Create connection object to the IMAP server'''
self._log.debug(
@ -345,12 +380,7 @@ class ImapFilter:
help='Whether to use a secure connection or not.'
)
@click.option(
'--filter', '-f', required=False,
multiple=True,
help='Filter rule.'
)
@click.option(
'--filters-file', '-F', required=False,
'--filters-file', '-F', required=True,
help='JSON file containing a list of dictionaries with the filter rules.'
)
@click_config_file.configuration_option()

363
test.json
View file

@ -1,10 +1,361 @@
[
{
"name": "python",
"mailbox": "Feeds/Mastodon/Test",
"field": "Subject",
"regexp": "^(?=.*nvidia.*|.*Nvidia.*|.*ansible.*|.*Ansible.*|.*ubuntu.*|.*Ubuntu.*|.*blender.*|.*Blender.*|.*technology.*|.*Technology.*|.*msdos.*|.*dosbox.*|.*python.*|.*Python.*|.*devops.*|.*DevOps.*|.*forgejo.*|.*Forgejo.*|.*smartphone.*|.*Smartphone.*|.*SmartPhone.*|.*Android.*|.*android.*|.*github.*|.*Github.*|.*gitlab.*|.*Gitlab.*|.*#programming.*|.*TechCrunch.*|.*researchbuzz.*|.*ripencc.*|.*FCAI.*|.*TechDesk.*|.*#selfhosting.*|.*#selfhosted.*|.*#ai.*|.*#deepfake.*|.*#chatgpt.*|.*#tietotekniikka.*|.*#videogames.*|.*#software.*|.*#retrogaming.*|.*#web.*|.*#gaming.*|.*#pcgaming.*|.*#gamedev.*|.*#fairphone.*|.*#ebike.*|.*#windows.*|.*#speedrun.*|.*#cloud.*|.*#euhosted.*|.*#python.*|.*#steamdeck.*|.*#indiegame.*|.*#webdev.*|.*#rustlang.*|.*#valve.*|.*#intel.*|.*#dns.*|.*#digitaljustice.*|.*#tv.*)",
"action": "move",
"destination": "Feeds/Mastodon/Test/tech"
"mailbox": "Feeds/Mastodon",
"filters": [
{
"name": "tech",
"field": "Subject",
"words": [
"nvidia",
"ansible",
"ubuntu",
"#blender",
"#technology",
"msdos",
"dosbox",
"#python",
"devops",
"forgejo",
"#smartphone",
"#android",
"github",
"gitlab",
"#programming",
"TechCrunch",
"researchbuzz",
"ripencc",
"FCAI",
"TechDesk",
"selfhost",
"#ai",
"#deepfake",
"#chatgpt",
"#tietotekniikka",
"#videogames",
"#software",
"#retrogaming",
"#web",
"#gaming",
"#pcgaming",
"#gamedev",
"#fairphone",
"#ebike",
"#windows",
"#speedrun",
"#cloud",
"#euhosted",
"#python",
"#steamdeck",
"#indiegame",
"#webdev",
"#rustlang",
"#valve",
"#intel",
"#dns",
"#digitaljustice",
"#tv",
"#internetarchive"
],
"action": "move",
"destination": "Feeds/Mastodon/Tech"
},
{
"name": "tampere",
"field": "Subject",
"words": [
"tampere",
"Tampere",
"pirkanmaa",
"Pirkanmaa",
"kaukajärv",
"Kaukajärv",
"#pirkkala"
],
"action": "move",
"destination": "Feeds/Mastodon/Tampere"
},
{
"name": "tampere_body",
"field": "Body",
"regexp": "^.*\"content\": \"(?=.*tampere.*|.*Tampere.*|.*pirkanmaa.*|.*Pirkanmaa.*|.*kaukajärv.*|.*Kaukajärv.*|.*#pirkkala)",
"action": "move",
"destination": "Feeds/Mastodon/Tampere"
},
{
"name": "infosec",
"field": "Subject",
"words": [
"#infosec",
"#hacking",
"#defcon",
"thehackernews",
"#cybersecurity",
"#opsec",
"#surveillance",
"#encryption",
"#security",
"#spyware",
"#ninjalab",
"#yubikey",
"#yubico"
],
"action": "move",
"destination": "Feeds/Mastodon/InfoSec"
},
{
"name": "infosec_body",
"field": "Body",
"regexp": "^.*\"content\": \"(?=.*infosec.*|.*InfoSec .*|.*hacking.*|.*Hacking .*|.*defcon.*|.*DefCon .*|.*thehackernews.*|.*cybersecurity .*|.*Cybersecurity .*|.*opsec .*|.*OpSec .*|.*#surveillance.*|.*#encryption.*|.*#security.*|.*#spyware .*|.*#ninjalab.*|.*#yubikey.*|.*#yubico.*)",
"action": "move",
"destination": "Feeds/Mastodon/InfoSec"
},
{
"name": "movies",
"field": "Subject",
"words": [
"#movie",
"#film",
"#trailer",
"#moviesuggestion",
"rottentomatoes",
"RottenTomatoes",
"#pelicula",
"#cine"
],
"action": "move",
"destination": "Feeds/Mastodon/Movies"
},
{
"name": "almeria",
"field": "Subject",
"words": [
"#almeria",
"#almería",
"#cabodegata",
"#costadelaluz"
],
"action": "move",
"destination": "Feeds/Mastodon/Almería"
},
{
"name": "almeria_body",
"field": "Body",
"regexp": "^.*\"content\": \"(?=.*almeria.*|.*Almeria .*|.*Almería.*|.*almería .*|.*cabo de gata.*|.*Cabo De Gata .*|.*Cabo de Gata.*|.*Costa de la Luz .*|.*Costa De La Luz .*|.*costa de la luz .*)",
"action": "move",
"destination": "Feeds/Mastodon/Almería"
},
{
"name": "Jobs",
"field": "Subject",
"words": [
"#gethired",
"#joboffer",
"#hiring",
"#flossjobs",
"#layoffs",
"#lookingforwork",
"#osjobjub",
"#trabajo",
"#laboral",
"#jobhunt",
"#fedihire",
"#fedijobs",
"#getfedihired",
"#trabajoremoto",
"#remotework",
"#wfh",
"#opensourcejobs"
],
"action": "move",
"destination": "Feeds/Mastodon/Jobs"
},
{
"name": "OpenSource",
"field": "Subject",
"words": [
"#pinephone",
"#pinetime",
"#pinetab",
"#jellyfin",
"linux",
"foss",
"opensource",
"freesoftware",
"LibreOffice",
"libreoffice",
"#sailfishos",
"#nextcloud",
"#righttorepair",
"#phosh",
"#debian",
"#openstreetmap",
"#mobian",
"#gnome",
"#kde",
"#xfce",
"#wayland",
"#waydroid",
"#steam",
"#godot",
"#inkscape",
"#homeassistant",
"#openhome"
],
"action": "move",
"destination": "Feeds/Mastodon/linux"
},
{
"name": "Cats",
"field": "Subject",
"words": [
"#caturday",
"#catsofmastodon",
"#martesdegatos"
],
"action": "move",
"destination": "Feeds/Mastodon/Caturday"
},
{
"name": "Dogs",
"field": "Subject",
"words": [
"#dogsofmastodon"
],
"action": "move",
"destination": "Feeds/Mastodon/DogsOfMastodon"
},
{
"name": "Fediverse",
"field": "Subject",
"words": [
"#fediverse",
"#mastodon",
"peertube",
"pixelfed",
"activitypub",
"ActivityPub",
"#mastodonadmin"
],
"action": "move",
"destination": "Feeds/Mastodon/Fediverse"
},
{
"name": "Humor",
"field": "Subject",
"words": [
"#humor",
"#funny",
"#meme"
],
"action": "move",
"destination": "Feeds/Mastodon/Humor"
},
{
"name": "News",
"field": "Subject",
"words": [
"#news",
"#noticias",
"ElSaltoDiario",
"earthquake",
"zoom_earth"
],
"action": "move",
"destination": "Feeds/Mastodon/News"
},
{
"name": "Jazz",
"field": "Subject",
"words": [
"jazz",
"Jazz"
],
"action": "move",
"destination": "Feeds/Mastodon/News"
},
{
"name": "USPolitics",
"field": "Subject",
"words": [
"#uspol",
"#kamalaharris",
"#trump",
"#biden",
"#gop",
"#texas"
],
"action": "move",
"destination": "Feeds/Mastodon/USPolitics"
},
{
"name": "España",
"field": "Subject",
"words": [
"#spanish",
"#spain",
"#españa",
"#espanja",
"#malaga",
"#málaga",
"#madrid",
"#andalucia",
"#sevilla",
"#barcelona",
"#mallorca",
"#ibiza",
"#canaryislands",
"#balear",
"España"
],
"action": "move",
"destination": "Feeds/Mastodon/España"
},
{
"name": "Birds",
"field": "Subject",
"words": [
"#birds"
],
"action": "move",
"destination": "Feeds/Mastodon/Birds"
},
{
"name": "Finland",
"field": "Subject",
"words": [
"#finland",
"#suomi"
],
"action": "move",
"destination": "Feeds/Mastodon/Finland"
},
{
"name": "Autism",
"field": "Subject",
"words": [
"#actuallyautistic",
"#autism",
"#autistic"
],
"action": "move",
"destination": "Feeds/Mastodon/Autism"
},
{
"name": "Art",
"field": "Subject",
"words": [
"#art",
"#artist",
"#streetart",
"#mural",
"#photography",
"#fotografie",
"#photoart",
"#urbanart"
],
"action": "move",
"destination": "Feeds/Mastodon/Art"
}
]
}
]