Add initial script

This commit is contained in:
Antonio J. Delgado 2025-07-24 12:31:27 +03:00
parent 54d285054d
commit ee8f3f6081

View file

@ -9,6 +9,7 @@ import sys
import os
import json
import time
import re
import logging
from logging.handlers import SysLogHandler
import click
@ -48,6 +49,38 @@ class ExtractWords:
}
self.data = self._read_cached_data()
# Extract the sorted set of unique words from the input file and write
# them, one per line, to the output file.
self._debug(
    f"Reading file '{self.config['input_file'].name}'..."
)
content = self.config['input_file'].read()
self._debug(
    "Unsplitting separated words..."
)
# A hyphen (optionally followed by one space) right before a newline is
# treated as a word broken across two lines, so the break is removed.
content = re.sub('- ?\n', '', content)
# Lower-case everything so the same word with different capitalisation
# is only counted once.
content = content.lower()
self._debug(
    f"Splitting file of {len(content)} characters..."
)
words = content.split()
self._debug(
    f"Found a total of {len(words)} (non-unique) words"
)
# Compiled once, outside the loop, instead of once per token.
word_pattern = re.compile(r'\w+')
# A set makes the duplicate check O(1) per token; the previous list
# membership test made the whole loop quadratic on large inputs.
unique_words = set()
for word in words:
    # search() instead of match(): a token that starts with punctuation
    # (an opening quote or parenthesis, for instance) still yields its
    # first run of word characters instead of being dropped entirely.
    found = word_pattern.search(word)
    if found is None:
        # The token holds no word characters at all (e.g. '--').
        continue
    candidate = found.group(0)
    # Tokens whose first character is an ASCII digit are not kept.
    if candidate[0] in '0123456789':
        continue
    unique_words.add(candidate)
all_words = sorted(unique_words)
self._debug(
    f"A total of {len(all_words)} unique words"
)
# Words are separated by newlines; no trailing newline is written.
self.config['output_file'].write('\n'.join(all_words))
def close(self):
    '''Finish with the instance by handing self.data to _save_cached_data.'''
    cached = self.data
    self._save_cached_data(cached)
@ -181,6 +214,20 @@ class ExtractWords:
default=60*60*24*7,
help='Max age in seconds for the cache'
)
@click.option(
'--input-file',
'-i',
required=True,
type=click.File('r'),
help='File containing words mixed with other things (like a book) in plain text',
)
@click.option(
'--output-file',
'-O',
required=True,
type=click.File('w'),
help='File to write the words',
)
# @click.option("--dummy","-n", is_flag=True,
# help="Don't do anything, just show what would be done.")
@click_config_file.configuration_option()