Add initial script

This commit is contained in:
Antonio J. Delgado 2025-07-24 12:31:27 +03:00
parent 54d285054d
commit ee8f3f6081

View file

@ -9,6 +9,7 @@ import sys
import os import os
import json import json
import time import time
import re
import logging import logging
from logging.handlers import SysLogHandler from logging.handlers import SysLogHandler
import click import click
@ -48,6 +49,38 @@ class ExtractWords:
} }
self.data = self._read_cached_data() self.data = self._read_cached_data()
self._debug(
f"Reading file '{self.config['input_file'].name}'..."
)
content = self.config['input_file'].read()
self._debug(
"Unsplitting separated words..."
)
content = re.sub('- ?\n', '', content)
content = content.lower()
self._debug(
f"Splitting file of {len(content)} bytes..."
)
words = content.split()
self._debug(
f"Found {len(words)} a total (non-unique) words"
)
all_words = []
for word in words:
match = re.match(r'\w*', word)
if match:
if match.group(0) != '':
match_numbers = re.match(r'[0-9]', match.group(0))
if not match_numbers:
if match.group(0) not in all_words:
all_words.append(match.group(0))
self._debug(
f"A total of {len(all_words)} unique words"
)
all_words.sort()
self.config['output_file'].write('\n'.join(all_words))
def close(self): def close(self):
'''Close class and save data''' '''Close class and save data'''
self._save_cached_data(self.data) self._save_cached_data(self.data)
@ -181,6 +214,20 @@ class ExtractWords:
default=60*60*24*7, default=60*60*24*7,
help='Max age in seconds for the cache' help='Max age in seconds for the cache'
) )
@click.option(
'--input-file',
'-i',
required=True,
type=click.File('r'),
help='File containing words mixed with other things (like a book) in plain text',
)
@click.option(
'--output-file',
'-O',
required=True,
type=click.File('w'),
help='File to write the words',
)
# @click.option("--dummy","-n", is_flag=True, # @click.option("--dummy","-n", is_flag=True,
# help="Don't do anything, just show what would be done.") # help="Don't do anything, just show what would be done.")
@click_config_file.configuration_option() @click_config_file.configuration_option()