import argparse


class ArgumentParser:
    """Command-line interface of the JMdict -> Dictionary.app converter.

    The underlying ``argparse`` parser is built and the process arguments
    are parsed as soon as an instance is created; the parsed values are
    then exposed through the accessor methods below.
    """

    def __init__(self):
        self.parser = self._setup_parser()
        self.args = self.parser.parse_args()

    def _setup_parser(self):
        """Create the ``argparse`` parser with all supported arguments."""
        parser = argparse.ArgumentParser(
            prog="JMdict -> Dictionary.app converter"
        )
        parser.add_argument(
            'dictionary',
            metavar='path',
            type=str,
            help="The input dictionary",
        )
        # Neither option string starts with "--", so argparse would derive
        # the destination from the first one ("kvg"). Name it explicitly so
        # that ``args.kanji_vg`` (read by kanji_location) actually exists.
        parser.add_argument(
            "-kvg", "-kanji_vg",
            dest="kanji_vg",
            help="Directory containing the KanjiVG library",
        )
        return parser

    def kanji_location(self):
        """Return the KanjiVG directory, or None when it was not given."""
        return self.args.kanji_vg

    def dictionary_location(self):
        """Return the path of the input dictionary."""
        return self.args.dictionary
class EnglishEntryParser:
    """Build English-keyed entries from already parsed Japanese entries.

    Each gloss of a Japanese entry whose first kanji and first reading are
    both common is turned into (or merged into) an English entry that lists
    the Japanese entry's title as its translation.

    Attributes:
        existing_entries: maps the stripped English word to its entry.
        entries: the values of ``existing_entries`` as a list.
    """

    def __init__(self, japanese_entries):
        self.existing_entries = dict()
        self._create_entries(japanese_entries)
        self.entries = list(self.existing_entries.values())

    @staticmethod
    def _strip_definition(definition):
        """Return the gloss text in front of the first "(".

        Surrounding whitespace is removed so that "throw (a ball)" and
        "throw" map to the same dictionary key.
        """
        return definition.partition("(")[0].strip()

    @staticmethod
    def _get_senses(definition):
        """Return the first parenthesised remark of a gloss as a list.

        The result has one element (the text between the first "(" and the
        next ")" after it) or is empty when no such pair exists.
        """
        open_brace = definition.find("(")
        if open_brace == -1:
            return []
        # Search only behind the opening brace; a stray ")" in front of it
        # must not be paired with it.
        close_brace = definition.find(")", open_brace + 1)
        if close_brace == -1:
            return []
        return [definition[open_brace + 1:close_brace]]

    @staticmethod
    def _index_terms(word):
        """Return the index terms for *word*.

        The word itself is always indexed. An infinitive such as "to throw"
        is additionally indexed under the bare verb ("throw"); words that
        merely begin with the letters "to" (e.g. "today") are not split.
        """
        terms = [word]
        if word.startswith("to "):
            bare_verb = word[3:].strip()
            if bare_verb:
                terms.append(bare_verb)
        return terms

    @staticmethod
    def _is_common(japanese_entry):
        """Return True when the first kanji and first reading are both common.

        Entries without any kanji or without any reading count as uncommon.
        """
        kanji = japanese_entry.kanji
        readings = japanese_entry.readings
        if not kanji or not readings:
            return False
        return bool(kanji[0].is_common and readings[0].is_common)

    def _create_entries(self, japanese_entries):
        """Fill ``existing_entries`` from the glosses of *japanese_entries*."""
        for japanese_entry in japanese_entries:
            # Commonness does not depend on the gloss, so test it once per
            # Japanese entry instead of once per definition.
            if not self._is_common(japanese_entry):
                continue

            for sense in japanese_entry.definitions:
                for definition in sense.definition:
                    word = self._strip_definition(definition)
                    if not word:
                        # A gloss that starts with "(" has no headword.
                        continue

                    entry = self.existing_entries.get(word)
                    if entry is None:
                        entry_id = f"en_{len(self.existing_entries) + 1}0"
                        entry = Entry(entry_id, word)
                        self.existing_entries[word] = entry

                    for term in self._index_terms(word):
                        entry.add_index(term)

                    entry.add_definition(
                        [japanese_entry.title],
                        sense_info=self._get_senses(definition),
                        misc_info=sense.misc_info,
                    )
tag from template, saving file size) - for element in ET.fromstring(minified_page): - page_root.append(element) + try: + for element in ET.fromstring(minified_page): + page_root.append(element) + except ET.ParseError as e: + print(minified_page) + raise e + + def _add_stroke_order(self): + if len(self.title) == 1 and self._KANJI_DB.has_image(self.title): + self.stroke_image = self._KANJI_DB.get_image_path(self.title) + def compile_entry(self): page_root_node = ET.Element("d:entry", { "id": self.id, "d:title": self.title }) + self._add_stroke_order() self._compile_indices(page_root_node) self._compile_page(page_root_node) return page_root_node \ No newline at end of file diff --git a/entry_templates/standard_entry.html b/entry_templates/standard_entry.html index 52413eb..9d11e21 100644 --- a/entry_templates/standard_entry.html +++ b/entry_templates/standard_entry.html @@ -23,54 +23,64 @@