From ee8f3f60816d7d47056616df2a22be1049e812fc Mon Sep 17 00:00:00 2001
From: "Antonio J. Delgado"
Date: Thu, 24 Jul 2025 12:31:27 +0300
Subject: [PATCH] Add initial script

---
Reviewer notes (text in this section is not part of the commit message):

* Line structure was restored; the patch had been collapsed onto two long
  lines. Indentation below is reconstructed from Python syntax and must be
  checked against the target tree before applying.
* The From: header carries no e-mail address; restore it before `git am`.
* Added lines revised relative to the first submission:
    - unique words are collected in a set rather than a list, so the
      membership test no longer makes the loop quadratic;
    - re.search(r'\w+') replaces re.match(r'\w*'), so tokens with leading
      punctuation ('"hello', '(word') are no longer dropped;
    - debug messages fixed ("bytes" -> "characters" for len() of a str,
      and the word order of the "Found ..." message);
    - one blank line, not two, before close().
* Hunk headers and the diffstat were recomputed for the new line counts.
  The post-image blob id on the index line was not, and is stale.

 extract_words/extract_words.py | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/extract_words/extract_words.py b/extract_words/extract_words.py
index 8ada551..e99a15f 100755
--- a/extract_words/extract_words.py
+++ b/extract_words/extract_words.py
@@ -9,6 +9,7 @@ import sys
 import os
 import json
 import time
+import re
 import logging
 from logging.handlers import SysLogHandler
 import click
@@ -48,6 +49,43 @@ class ExtractWords:
         }
         self.data = self._read_cached_data()
 
+        self._debug(
+            f"Reading file '{self.config['input_file'].name}'..."
+        )
+        content = self.config['input_file'].read()
+        self._debug(
+            "Unsplitting separated words..."
+        )
+        # Re-join words that were hyphenated across a line break.
+        content = re.sub('- ?\n', '', content)
+        content = content.lower()
+        self._debug(
+            f"Splitting file of {len(content)} characters..."
+        )
+        words = content.split()
+        self._debug(
+            f"Found a total of {len(words)} (non-unique) words"
+        )
+        # A set keeps the uniqueness check O(1) per token, which matters
+        # for book-sized inputs.
+        unique_words = set()
+        for word in words:
+            # search() instead of match(): a token that starts with
+            # punctuation, e.g. '"hello' or '(word', still yields a word.
+            match = re.search(r'\w+', word)
+            if match is None:
+                continue
+            candidate = match.group(0)
+            # Skip candidates that start with an ASCII digit.
+            if re.match(r'[0-9]', candidate):
+                continue
+            unique_words.add(candidate)
+        all_words = sorted(unique_words)
+        self._debug(
+            f"A total of {len(all_words)} unique words"
+        )
+        self.config['output_file'].write('\n'.join(all_words))
+
     def close(self):
         '''Close class and save data'''
         self._save_cached_data(self.data)
@@ -181,6 +219,20 @@ class ExtractWords:
     default=60*60*24*7,
     help='Max age in seconds for the cache'
 )
+@click.option(
+    '--input-file',
+    '-i',
+    required=True,
+    type=click.File('r'),
+    help='File containing words mixed with other things (like a book) in plain text',
+)
+@click.option(
+    '--output-file',
+    '-O',
+    required=True,
+    type=click.File('w'),
+    help='File to write the words',
+)
 # @click.option("--dummy","-n", is_flag=True,
 #     help="Don't do anything, just show what would be done.")
 @click_config_file.configuration_option()