diff --git a/argument_parser.py b/argument_parser.py new file mode 100644 index 0000000..1f7c33d --- /dev/null +++ b/argument_parser.py @@ -0,0 +1,20 @@ +import argparse + +class ArgumentParser: + def __init__(self): + self.parser = self._setup_parser() + self.args = self.parser.parse_args() + + def _setup_parser(self): + parser = argparse.ArgumentParser( + prog="JMdict -> Dictionary.app converter" + ) + parser.add_argument('dictionary', metavar='path', type=str, help="The input dictionary") + parser.add_argument("-kvg", "-kanji_vg", help="Directory containing the KanjiVG library") + return parser + + def kanji_location(self): + return self.args.kanji_vg + + def dictionary_location(self): + return self.args.dictionary \ No newline at end of file diff --git a/english_entry_parser.py b/english_entry_parser.py new file mode 100644 index 0000000..0047f1a --- /dev/null +++ b/english_entry_parser.py @@ -0,0 +1,53 @@ +from entry import Entry + +class EnglishEntryParser: + def __init__(self, japanese_entries): + self.existing_entries = dict() + self._create_entries(japanese_entries) + self.entries = list(self.existing_entries.values()) + + @staticmethod + def _strip_definition(definition): + if "(" in definition: + return definition[:definition.index("(")] + return definition + + @staticmethod + def _get_senses(definition): + if "(" in definition and ")" in definition: + open_brace = definition.index("(") + close_brace = definition.index(")") + return [definition[open_brace+1:close_brace]] + return [] + + def _create_entries(self, japanese_entries): + for japanese_entry in japanese_entries: + for sense in japanese_entry.definitions: + for definition in sense.definition: + word = self._strip_definition(definition) + + try: + first_kanji_common = japanese_entry.kanji[0].is_common + except IndexError: + first_kanji_common = False + + if not first_kanji_common or not japanese_entry.readings[0].is_common: + continue + + if word not in self.existing_entries: + entry_id = f"en_{len(self.existing_entries) + 1}0" + entry = Entry(entry_id, word) + self.existing_entries[word] = entry + + entry = self.existing_entries[word] + + entry.add_index(word) + + # Add an index for e.g. to throw -> throw + if word.startswith("to"): + entry.add_index(word[2:]) + + entry_translation = [japanese_entry.title] + senses = self._get_senses(definition) + + entry.add_definition(entry_translation, sense_info=senses, misc_info=sense.misc_info) diff --git a/entry.py b/entry.py index 01c7dcc..d451fb8 100644 --- a/entry.py +++ b/entry.py @@ -1,21 +1,16 @@ +import re import jaconv +import xml.etree.ElementTree as ET from jinja2 import Template, Environment, select_autoescape, FileSystemLoader -import xml.etree.ElementTree as ET -from htmlmin import Minifier from kanji import Kanji from reading import Reading from definition import Definition +from kanji_vg_parser import KanjiImage -class Entry: - _MINIFIER = Minifier( - remove_comments=True, - remove_empty_space=True, - remove_all_empty_space=True, - remove_optional_attribute_quotes=False - ) - + +class Entry: _ENVIRONMENT = Environment( loader=FileSystemLoader("./entry_templates"), autoescape=select_autoescape( @@ -26,9 +21,12 @@ class Entry: _TEMPLATE = _ENVIRONMENT.get_template('standard_entry.html') + _KANJI_DB = KanjiImage("./kanji") + def __init__(self, entry_id, title): self.id = entry_id.strip() self.title = title.strip() + self.stroke_image = None self.indices = [] self.kanji = [] self.readings = [] @@ -88,13 +86,23 @@ def _compile_page(self, page_root): # Generate the page text using Jinja2 page_text = self._TEMPLATE.render(entry=self) # Minify the page output (So that compilation doesn't crash later from too much input) - minified_page = self._MINIFIER.minify(page_text) + minified_page = re.sub(r">[\s]*<", "><", page_text) # Go through each element and reimport it (Removes tag from template, saving file size) - for element in ET.fromstring(minified_page): - page_root.append(element) + try: + for element in ET.fromstring(minified_page): + page_root.append(element) + except ET.ParseError as e: + print(minified_page) + raise e + + def _add_stroke_order(self): + if len(self.title) == 1 and self._KANJI_DB.has_image(self.title): + self.stroke_image = self._KANJI_DB.get_image_path(self.title) + def compile_entry(self): page_root_node = ET.Element("d:entry", { "id": self.id, "d:title": self.title }) + self._add_stroke_order() self._compile_indices(page_root_node) self._compile_page(page_root_node) return page_root_node \ No newline at end of file diff --git a/entry_templates/standard_entry.html b/entry_templates/standard_entry.html index 52413eb..9d11e21 100644 --- a/entry_templates/standard_entry.html +++ b/entry_templates/standard_entry.html @@ -23,54 +23,64 @@

「{{ entry.readings[0].reading }}」

{% endif %} - - {% if entry.kanji | count > 1 or entry.readings | count > 1 %} -
- - {% if entry.kanji | count > 1 %} -
- Alternative Forms - {% for kanji in entry.kanji[1:] %} -
-
{{ kanji.kanji }}
- {% for info in kanji.extra_info %} -
-
-
{{ info }}
+
+ + {% if entry.kanji | count > 1 or entry.readings | count > 1 %} +
+ + {% if entry.kanji | count > 1 %} +
+
Alternative Forms
+ {% for kanji in entry.kanji[1:] %} +
+
{{ kanji.kanji }}
+ {% for info in kanji.extra_info %} +
+
+
{{ info }}
+
+ {% endfor %} + {% if kanji.is_common %} +
Common Kanji
+ {% endif %}
{% endfor %} - {% if kanji.is_common %} -
Common Kanji
- {% endif %}
- {% endfor %} -
- {% endif %} + {% endif %} - - {% if entry.readings | count > 1 %} -
- Alternative Readings - {% for reading in entry.readings %} - {% if reading.reading != entry.title %} -
-
{{ reading.reading }}
- {% if reading.is_common %} -
Common Kana
+ + {% if entry.readings | count > 1 %} +
+ Alternative Readings + {% for reading in entry.readings %} + {% if reading.reading != entry.title %} +
+
{{ reading.reading }}
+ {% if reading.is_common %} +
Common Kana
+ {% endif %} + {% for related in reading.relates_to %} +
({{ related }})
+ {% endfor %} + {% if not reading.is_true_reading %} +
Reference Only
+ {% endif %} +
{% endif %} - {% for related in reading.relates_to %} -
({{ related }})
{% endfor %} - {% if not reading.is_true_reading %} -
Reference Only
- {% endif %}
{% endif %} - {% endfor %} +
+ {% endif %} + + + {% if entry.stroke_image != None %} +
+
Stroke Order
+
{% endif %}
- {% endif %} @@ -119,7 +129,8 @@

「{{ reading }}」

{% if reference.reference_id != None %} {% endif %} {% endfor %} diff --git a/japanese_entry_parser.py b/japanese_entry_parser.py new file mode 100644 index 0000000..1199300 --- /dev/null +++ b/japanese_entry_parser.py @@ -0,0 +1,109 @@ +import itertools + +import xml.etree.ElementTree as ET + +from reference_tracker import ReferenceTracker +from entry import Entry + + +class JapaneseEntryParser: + def __init__(self, dictionary_location): + self.ref_tracker = ReferenceTracker() + self.entries = [] + + # Get the root element of the input dictionary + input_root = ET.parse(dictionary_location).getroot() + + # Get a list of entries in the input dictionary XML + entry_tags = input_root.findall("entry") + + for entry_tag in entry_tags: + # Generate an id with a JP prefix to denote japanese page + entry_id = f"jp_{entry_tag.find('ent_seq').text}" + entry_title = self._get_entry_title(entry_tag) + + entry = Entry(entry_id, entry_title) + + self._get_kanji(entry_tag, entry) + self._get_readings(entry_tag, entry) + self._get_translations(entry_tag, entry) + + self.entries.append(entry) + + @staticmethod + def _generate_text_list(tag_search): + return [x.text for x in tag_search] + + @staticmethod + def _get_entry_title(entry_tag): + kanji_tags = entry_tag.findall("k_ele") + + # See if there's a Kanji title + if len(kanji_tags) != 0: + return kanji_tags[0].find("keb").text + + # Otherwise return kana + return entry_tag.find("r_ele").find("reb").text + + @staticmethod + def _get_kanji(entry_tag, entry): + kanji_elements = entry_tag.findall("k_ele") + + # Fetch all related data for each kanji element + for element in kanji_elements: + kanji = element.find("keb").text + information = JapaneseEntryParser._generate_text_list(element.findall("ke_inf")) + priority = JapaneseEntryParser._generate_text_list(element.findall("ke_pri")) + + # Create an index entry for the kanji + entry.add_index(kanji) + # Add the kanji to the entry + entry.add_kanji(kanji, information, priority) + + @staticmethod + def _get_readings(entry_tag, entry): + reading_elements = entry_tag.findall("r_ele") + + # Fetch all related data for each kanji element + for element in reading_elements: + reading = element.find("reb").text + information = JapaneseEntryParser._generate_text_list(element.findall("re_inf")) + related_kanji = JapaneseEntryParser._generate_text_list(element.findall("re_restr")) + priority = JapaneseEntryParser._generate_text_list(element.findall("re_pri")) + # A non-true reading does not contain the "re_nokanji" tag + is_true = element.find("re_nokanji") == None + + # Create an index for the reading + entry.add_index(reading) + # Add the reading to the entry + entry.add_reading(reading, information, is_true, related_kanji, priority) + + def _get_translations(self, entry_tag, entry): + for index, element in enumerate(entry_tag.findall("sense")): + # Chain the kanji and reading related tags together and generate a list + related_readings = JapaneseEntryParser._generate_text_list( + itertools.chain( + element.findall("stagk"), + element.findall("stagr") + ) + ) + + speech_parts = self._generate_text_list(element.findall("pos")) + x_references = self._generate_text_list(element.findall("xref")) + antonyms = self._generate_text_list(element.findall("ant")) + fields = self._generate_text_list(element.findall("field")) + misc = self._generate_text_list(element.findall("misc")) + senses = self._generate_text_list(element.findall("sense")) + source_langs = self._generate_text_list(element.findall("lsource")) + dialects = self._generate_text_list(element.findall("dial")) + translations = self._generate_text_list(element.findall("gloss")) + + self.ref_tracker.add_reference(index, entry.id, entry.title) + + entry.add_definition(translations, x_references, speech_parts, related_readings, antonyms, fields, misc, senses, source_langs, dialects) + + def get_entries(self): + return self.entries + + def get_references(self): + return self.ref_tracker \ No newline at end of file diff --git a/kanji_vg.zip b/kanji_vg.zip new file mode 100644 index 0000000..9427162 Binary files /dev/null and b/kanji_vg.zip differ diff --git a/kanji_vg_parser.py b/kanji_vg_parser.py new file mode 100644 index 0000000..0313251 --- /dev/null +++ b/kanji_vg_parser.py @@ -0,0 +1,41 @@ +from os import listdir, path +import xml.etree.ElementTree as ET + +class KanjiImage: + def __init__(self, search_directory): + primary_element_attribute = "{http://kanjivg.tagaini.net}element" + + self.index = dict() + + images = listdir(search_directory) + # Filter to only get svg images + images = filter(lambda x: ".svg" in x, images) + # Filter out all variation images of form "02414 - variation.svg" + images = filter(lambda x: "-" not in x, images) + # Attach the path to each image name + image_paths = map(lambda x: path.join(search_directory, x), images) + + for image_path in image_paths: + kanji_image_svg = ET.parse(image_path) + + # Search all elements in the SVG because namespacing is broken or maybe I can't work it out + for element in kanji_image_svg.findall(".//"): + attributes = element.attrib + if primary_element_attribute in attributes and "-" not in attributes["id"]: + character_represented = attributes[primary_element_attribute] + self.index[character_represented] = path.basename(image_path) + + def has_image(self, kanji): + return kanji in self.index + + def get_image(self, kanji): + if kanji in self.index: + return ET.parse(self.index[kanji]) + else: + return None + + def get_image_path(self, kanji): + if kanji in self.index: + return self.index[kanji] + else: + return None \ No newline at end of file diff --git a/main.py b/main.py index 31216b5..a980ebf 100644 --- a/main.py +++ b/main.py @@ -1,131 +1,50 @@ +import itertools + import xml.etree.ElementTree as ET +from argument_parser import ArgumentParser +from os import path, remove, listdir from tqdm import tqdm from dictionary import Dictionary -from entry import Entry - -print("Parsing input dictionary...") - -# input_tree = ET.parse("dictionaries/JMdict_e.xml") -input_tree = ET.parse("dictionaries/small_dict.xml") -input_root = input_tree.getroot() - -output_dictionary = Dictionary() -entries = [] -reference_dict = {} -reverse_words = {} - -entry_choices = input_root.findall("entry") - -print("Generating Japanese pages...") - -for child in entry_choices: - entry_id = "jp_{}".format(child.findall("ent_seq")[0].text) - if len(child.findall("k_ele")) != 0: - entry_title = child.findall("k_ele")[0].findall("keb")[0].text - else: - entry_title = child.findall("r_ele")[0].findall("reb")[0].text - - entry = Entry(entry_id, entry_title) +from japanese_entry_parser import JapaneseEntryParser +from english_entry_parser import EnglishEntryParser - for kanji_element in child.findall("k_ele"): - kanji = kanji_element.findall("keb")[0].text - kanji_information = [x.text for x in kanji_element.findall("ke_inf")] - kanji_priority = [x.text for x in reading_element.findall("ke_pri")] - entry.add_index(kanji) - entry.add_kanji(kanji, kanji_information, kanji_priority) +print("Parsing arguments") +args = ArgumentParser() - for reading_element in child.findall("r_ele"): - reading = reading_element.findall("reb")[0].text - reading_information = [x.text for x in reading_element.findall("re_inf")] - is_true_reading = len(reading_element.findall("re_nokanji")) == 0 - reading_relates_to = [x.text for x in reading_element.findall("re_restr")] - reading_priority = [x.text for x in reading_element.findall("re_pri")] +print("Parsing Japanese dictionary") +japanese_entry_parser = JapaneseEntryParser(args.dictionary_location()) - entry.add_index(reading) - entry.add_reading(reading, reading_information, is_true_reading, reading_relates_to, reading_priority) +japanese_entries = japanese_entry_parser.get_entries() +reference_tracker = japanese_entry_parser.get_references() - for index, sense_element in enumerate(child.findall("sense")): - related_readings = [] - for reading_element in sense_element.findall("stagk"): - related_readings.append(reading_element.text) - for reading_element in sense_element.findall("stagr"): - related_readings.append(reading_element.text) - part_of_speech = [x.text for x in sense_element.findall("pos")] - cross_references = [x.text for x in sense_element.findall("xref")] - antonyms = [x.text for x in sense_element.findall("ant")] - field = [x.text for x in sense_element.findall("field")] - misc_info = [x.text for x in sense_element.findall("misc")] - sense_info = [x.text for x in sense_element.findall("s_inf")] - language_source = [x.text for x in sense_element.findall("lsource")] - dialects = [x.text for x in sense_element.findall("dial")] - definitions = [x.text for x in sense_element.findall("gloss")] +print("Generating English entries") +english_entry_parser = EnglishEntryParser(japanese_entries) +english_entries = english_entry_parser.entries - reference_number = "・{}".format(index + 1) - if index == 0: - reference_dict[entry.title] = entry.id - reference_dict["{}{}".format(entry.title, reference_number)] = entry.id - - # Create a list of english translations - for gloss in sense_element.findall("gloss"): - stripped_gloss = gloss.text - brace_index = gloss.text.find("(") - if brace_index != -1: - stripped_gloss = stripped_gloss[:brace_index] - if stripped_gloss.count(" ") < 2: - if stripped_gloss in reverse_words: - if entry not in (x[0] for x in reverse_words[stripped_gloss]): - reverse_words[stripped_gloss].append( - [entry, gloss.text]) - else: - reverse_words[stripped_gloss] = [[entry, gloss.text]] - entry.add_definition(definitions, cross_references, part_of_speech, related_readings, antonyms, field, misc_info, sense_info, language_source, dialects) - - # Add the entry to the output array - entries.append(entry) - -print("Generating English pages...") +print("Generating output dictionary") +output_dictionary = Dictionary() -for index, word in enumerate(reverse_words): - entry = Entry("en_{}".format(index), word) - entry.add_index(word) - if word.startswith("to "): - entry.add_index(word[2:]) - for jp_entry, full_word in reverse_words[word]: - translation = [jp_entry.title] - sense_info = [] - if full_word != word: - clarification = full_word - if clarification.startswith(word): - clarification = clarification[len(word):] - clarification = clarification.strip("()") - - # Capitalise the first letter in the sentence - # (Can't use capitalize() because it doesn't do proper nouns) - clarification = "{}{}".format(clarification[0].upper(), clarification[1:]) - sense_info.append(clarification) - - entry.add_definition(translation, sense_info=sense_info) - entries.append(entry) +all_entries = [*japanese_entries, *english_entries] -error_entries = 0 +for entry in tqdm(all_entries): + output_dictionary.add_page(entry) -print("Generating cross reference links") -for entry in entries: - for definition in entry.definitions: - for reference in definition.cross_references: - if reference.reference_word in reference_dict: - reference.set_reference_id(reference_dict[reference.reference_word]) - else: - error_entries += 1 +print("Moving images") +image_output_path = path.join("project", "OtherResources", "Images") +image_input_path = path.join("kanji") +for file in listdir(image_output_path): + if ".svg" in file: + remove(path.join(image_output_path, file)) -print("Could not find {} reference(s)".format(error_entries)) - +for entry in japanese_entries: + if entry.stroke_image != None: + with open(path.join(image_input_path, entry.stroke_image), "rb") as in_file: + input_svg = in_file.read() + with open(path.join(image_output_path, entry.stroke_image), "wb") as out_file: + out_file.write(input_svg) -print("Generating output dictionary") -for entry in tqdm(entries): - output_dictionary.add_page(entry) print("Saving Dictionary") -output_dictionary.save_dictionary("project/JapaneseDictionary.xml") +output_dictionary.save_dictionary("project/JapaneseDictionary.xml") \ No newline at end of file diff --git a/project/JapaneseDictionary.css b/project/JapaneseDictionary.css index ea8ce1e..750616f 100644 --- a/project/JapaneseDictionary.css +++ b/project/JapaneseDictionary.css @@ -78,6 +78,10 @@ align-self: center; } +.alternativeFormsContainer, .alternativeReadingContainer { + width: 100%; +} + .alternativeFormsContainer { display: grid; grid-template-rows: min-content; @@ -85,7 +89,6 @@ background: rgb(240, 240, 240); border-radius: 5px; padding: 10px; - margin: 10px 0; } .titleContainer { @@ -127,6 +130,32 @@ align-self: center; } +.kanjiPathContainer { + background: rgb(240, 240, 240); + border-radius: 5px; + padding: 10px; +} + +.subheaderContainer { + display: flex; + justify-content: stretch; + max-width: 600px; + margin: 0; + padding: 0; +} + +.subheaderSubcontainer { + margin: 5px 0; +} + +.subheaderSubcontainer:first-of-type { + margin-right: 5px; +} + +.subheaderSubcontainer:last-of-type { + margin-left: 5px; +} + @media (prefers-dark-interface) { body { @@ -141,7 +170,7 @@ background: rgb(100, 100, 100); } - .alternativeFormsContainer { + .alternativeFormsContainer, .kanjiPathContainer { background: rgb(60, 60, 60); } diff --git a/reference_tracker.py b/reference_tracker.py new file mode 100644 index 0000000..d8f2342 --- /dev/null +++ b/reference_tracker.py @@ -0,0 +1,12 @@ +class ReferenceTracker: + def __init__(self): + self.references = dict() + + def add_reference(self, reference_index, page_id, title): + ref_format = f"{title}・{reference_index + 1}" + self.references[ref_format] = page_id + if reference_index == 0: + self.references[title] = page_id + + def get_reference(self, reference): + return self.references.get(reference, None) \ No newline at end of file diff --git a/screenshots/a.png b/screenshots/a.png index 5e6892c..bb92697 100644 Binary files a/screenshots/a.png and b/screenshots/a.png differ diff --git a/screenshots/b.png b/screenshots/b.png index e299269..ef6c1d9 100644 Binary files a/screenshots/b.png and b/screenshots/b.png differ