Add initial script
This commit is contained in:
parent
54d285054d
commit
ee8f3f6081
1 changed file with 47 additions and 0 deletions
|
@@ -9,6 +9,7 @@ import sys
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import re
|
||||||
import logging
|
import logging
|
||||||
from logging.handlers import SysLogHandler
|
from logging.handlers import SysLogHandler
|
||||||
import click
|
import click
|
||||||
|
@@ -48,6 +49,38 @@ class ExtractWords:
|
||||||
}
|
}
|
||||||
self.data = self._read_cached_data()
|
self.data = self._read_cached_data()
|
||||||
|
|
||||||
|
self._debug(
|
||||||
|
f"Reading file '{self.config['input_file'].name}'..."
|
||||||
|
)
|
||||||
|
content = self.config['input_file'].read()
|
||||||
|
self._debug(
|
||||||
|
"Unsplitting separated words..."
|
||||||
|
)
|
||||||
|
content = re.sub('- ?\n', '', content)
|
||||||
|
content = content.lower()
|
||||||
|
self._debug(
|
||||||
|
f"Splitting file of {len(content)} bytes..."
|
||||||
|
)
|
||||||
|
words = content.split()
|
||||||
|
self._debug(
|
||||||
|
f"Found {len(words)} a total (non-unique) words"
|
||||||
|
)
|
||||||
|
all_words = []
|
||||||
|
for word in words:
|
||||||
|
match = re.match(r'\w*', word)
|
||||||
|
if match:
|
||||||
|
if match.group(0) != '':
|
||||||
|
match_numbers = re.match(r'[0-9]', match.group(0))
|
||||||
|
if not match_numbers:
|
||||||
|
if match.group(0) not in all_words:
|
||||||
|
all_words.append(match.group(0))
|
||||||
|
self._debug(
|
||||||
|
f"A total of {len(all_words)} unique words"
|
||||||
|
)
|
||||||
|
all_words.sort()
|
||||||
|
self.config['output_file'].write('\n'.join(all_words))
|
||||||
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
'''Close class and save data'''
|
'''Close class and save data'''
|
||||||
self._save_cached_data(self.data)
|
self._save_cached_data(self.data)
|
||||||
|
@@ -181,6 +214,20 @@ class ExtractWords:
|
||||||
default=60*60*24*7,
|
default=60*60*24*7,
|
||||||
help='Max age in seconds for the cache'
|
help='Max age in seconds for the cache'
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
'--input-file',
|
||||||
|
'-i',
|
||||||
|
required=True,
|
||||||
|
type=click.File('r'),
|
||||||
|
help='File containing words mixed with other things (like a book) in plain text',
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
'--output-file',
|
||||||
|
'-O',
|
||||||
|
required=True,
|
||||||
|
type=click.File('w'),
|
||||||
|
help='File to write the words',
|
||||||
|
)
|
||||||
# @click.option("--dummy","-n", is_flag=True,
|
# @click.option("--dummy","-n", is_flag=True,
|
||||||
# help="Don't do anything, just show what would be done.")
|
# help="Don't do anything, just show what would be done.")
|
||||||
@click_config_file.configuration_option()
|
@click_config_file.configuration_option()
|
||||||
|
|
Loading…
Reference in a new issue