-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Kanji stroke order and large rewrite
- Loading branch information
Showing
12 changed files
with
369 additions
and
167 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import argparse | ||
|
||
class ArgumentParser:
    """Command-line interface of the JMdict -> Dictionary.app converter.

    The underlying ``argparse`` parser is built and the arguments are parsed
    as soon as the object is constructed; the accessors below expose the
    parsed values.
    """

    def __init__(self, argv=None):
        """Build the parser and parse the command line.

        Args:
            argv: Optional list of argument strings. ``None`` (the default)
                lets argparse read ``sys.argv[1:]``.
        """
        self.parser = self._setup_parser()
        self.args = self.parser.parse_args(argv)

    def _setup_parser(self):
        """Return the configured ``argparse.ArgumentParser``."""
        parser = argparse.ArgumentParser(
            prog="JMdict -> Dictionary.app converter"
        )
        parser.add_argument('dictionary', metavar='path', type=str, help="The input dictionary")
        # Both option strings use a single dash, so argparse would derive the
        # destination from the first one ("kvg"). Name it explicitly so that
        # kanji_location() can read ``args.kanji_vg``.
        parser.add_argument(
            "-kvg", "-kanji_vg",
            dest="kanji_vg",
            help="Directory containing the KanjiVG library",
        )
        return parser

    def kanji_location(self):
        """Return the KanjiVG directory, or ``None`` when it was not given."""
        return self.args.kanji_vg

    def dictionary_location(self):
        """Return the path of the input dictionary."""
        return self.args.dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from entry import Entry | ||
|
||
class EnglishEntryParser:
    """Build English-headword entries from already parsed Japanese entries.

    Every English gloss of a common Japanese entry becomes (or is merged
    into) one English entry whose definitions are the Japanese entry titles.
    """

    def __init__(self, japanese_entries):
        """Create the English entries for ``japanese_entries``.

        Args:
            japanese_entries: Iterable of entry objects exposing ``kanji``,
                ``readings``, ``definitions`` and ``title``.
        """
        # Maps a stripped English headword to its Entry so that the same
        # gloss coming from several Japanese entries is merged.
        self.existing_entries = dict()
        self._create_entries(japanese_entries)
        self.entries = list(self.existing_entries.values())

    @staticmethod
    def _strip_definition(definition):
        """Return the gloss text before the first "(", without outer spaces.

        Stripping keeps "dog (animal)" and "dog" under the same headword
        instead of producing "dog " and "dog".
        """
        head, _, _ = definition.partition("(")
        return head.strip()

    @staticmethod
    def _get_senses(definition):
        """Return the first parenthesised note of a gloss as a one-item list.

        An empty list is returned when there is no "(" or no ")" after it.
        """
        open_brace = definition.find("(")
        if open_brace == -1:
            return []
        # Search for the closing parenthesis only after the opening one, so a
        # stray ")" earlier in the gloss is not picked up.
        close_brace = definition.find(")", open_brace + 1)
        if close_brace == -1:
            return []
        return [definition[open_brace + 1:close_brace]]

    @staticmethod
    def _is_common(japanese_entry):
        """Tell whether the first kanji and the first reading are both common.

        Entries without any kanji (or without any reading) count as not
        common.
        """
        try:
            first_kanji_common = japanese_entry.kanji[0].is_common
            first_reading_common = japanese_entry.readings[0].is_common
        except IndexError:
            return False
        return bool(first_kanji_common and first_reading_common)

    def _create_entries(self, japanese_entries):
        """Fill ``self.existing_entries`` from the glosses of common entries."""
        for japanese_entry in japanese_entries:
            # The commonness of an entry does not depend on the gloss being
            # processed, so decide once per entry.
            if not self._is_common(japanese_entry):
                continue

            for sense in japanese_entry.definitions:
                for definition in sense.definition:
                    word = self._strip_definition(definition)

                    # A gloss that starts with "(" leaves no headword to file
                    # the translation under.
                    if not word:
                        continue

                    entry = self.existing_entries.get(word)
                    if entry is None:
                        entry_id = f"en_{len(self.existing_entries) + 1}0"
                        entry = Entry(entry_id, word)
                        self.existing_entries[word] = entry

                    entry.add_index(word)

                    # Add an index for e.g. "to throw" -> "throw". Require the
                    # space so that words such as "tomato" are left alone.
                    if word.startswith("to "):
                        entry.add_index(word[3:].lstrip())

                    entry_translation = [japanese_entry.title]
                    senses = self._get_senses(definition)

                    entry.add_definition(entry_translation, sense_info=senses, misc_info=sense.misc_info)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import itertools | ||
|
||
import xml.etree.ElementTree as ET | ||
|
||
from reference_tracker import ReferenceTracker | ||
from entry import Entry | ||
|
||
|
||
class JapaneseEntryParser:
    """Parse a JMdict XML file into ``Entry`` objects.

    For every ``<entry>`` element the kanji, readings and senses are read and
    stored on a new ``Entry``; each sense is also registered with a
    ``ReferenceTracker``.
    """

    def __init__(self, dictionary_location):
        """Parse the dictionary found at ``dictionary_location``.

        Args:
            dictionary_location: Path (or file object) accepted by
                ``xml.etree.ElementTree.parse``.
        """
        self.ref_tracker = ReferenceTracker()
        self.entries = []

        # Get the root element of the input dictionary
        input_root = ET.parse(dictionary_location).getroot()

        for entry_tag in input_root.findall("entry"):
            # Generate an id with a JP prefix to denote japanese page
            entry_id = f"jp_{entry_tag.find('ent_seq').text}"
            entry_title = self._get_entry_title(entry_tag)

            entry = Entry(entry_id, entry_title)

            self._get_kanji(entry_tag, entry)
            self._get_readings(entry_tag, entry)
            self._get_translations(entry_tag, entry)

            self.entries.append(entry)

    @staticmethod
    def _generate_text_list(tag_search):
        """Return the ``text`` of every element in ``tag_search`` as a list."""
        return [x.text for x in tag_search]

    @staticmethod
    def _get_entry_title(entry_tag):
        """Return the first kanji spelling, or the first kana reading if none."""
        first_kanji = entry_tag.find("k_ele")

        # Compare against None explicitly: an Element's truth value is not a
        # reliable "was it found" test.
        if first_kanji is not None:
            return first_kanji.find("keb").text

        # Otherwise return kana
        return entry_tag.find("r_ele").find("reb").text

    @staticmethod
    def _get_kanji(entry_tag, entry):
        """Add every ``<k_ele>`` of ``entry_tag`` to ``entry`` and index it."""
        # Fetch all related data for each kanji element
        for element in entry_tag.findall("k_ele"):
            kanji = element.find("keb").text
            information = JapaneseEntryParser._generate_text_list(element.findall("ke_inf"))
            priority = JapaneseEntryParser._generate_text_list(element.findall("ke_pri"))

            # Create an index entry for the kanji
            entry.add_index(kanji)
            # Add the kanji to the entry
            entry.add_kanji(kanji, information, priority)

    @staticmethod
    def _get_readings(entry_tag, entry):
        """Add every ``<r_ele>`` of ``entry_tag`` to ``entry`` and index it."""
        # Fetch all related data for each reading element
        for element in entry_tag.findall("r_ele"):
            reading = element.find("reb").text
            information = JapaneseEntryParser._generate_text_list(element.findall("re_inf"))
            related_kanji = JapaneseEntryParser._generate_text_list(element.findall("re_restr"))
            priority = JapaneseEntryParser._generate_text_list(element.findall("re_pri"))
            # A true reading does not contain the "re_nokanji" tag
            is_true = element.find("re_nokanji") is None

            # Create an index for the reading
            entry.add_index(reading)
            # Add the reading to the entry
            entry.add_reading(reading, information, is_true, related_kanji, priority)

    def _get_translations(self, entry_tag, entry):
        """Add every ``<sense>`` of ``entry_tag`` to ``entry`` as a definition.

        Each sense is also registered with the reference tracker under its
        position within the entry.
        """
        for index, element in enumerate(entry_tag.findall("sense")):
            # Chain the kanji and reading related tags together and generate a list
            related_readings = self._generate_text_list(
                itertools.chain(
                    element.findall("stagk"),
                    element.findall("stagr")
                )
            )

            speech_parts = self._generate_text_list(element.findall("pos"))
            x_references = self._generate_text_list(element.findall("xref"))
            antonyms = self._generate_text_list(element.findall("ant"))
            fields = self._generate_text_list(element.findall("field"))
            misc = self._generate_text_list(element.findall("misc"))
            # Sense information lives in <s_inf>; ``element`` is itself the
            # <sense> tag, so searching it for "sense" children never matched.
            senses = self._generate_text_list(element.findall("s_inf"))
            # NOTE(review): text may be None for <lsource> tags that only
            # carry attributes — confirm that Entry copes with it.
            source_langs = self._generate_text_list(element.findall("lsource"))
            dialects = self._generate_text_list(element.findall("dial"))
            translations = self._generate_text_list(element.findall("gloss"))

            self.ref_tracker.add_reference(index, entry.id, entry.title)

            entry.add_definition(translations, x_references, speech_parts, related_readings, antonyms, fields, misc, senses, source_langs, dialects)

    def get_entries(self):
        """Return the list of parsed entries."""
        return self.entries

    def get_references(self):
        """Return the reference tracker filled while parsing."""
        return self.ref_tracker
Binary file not shown.
Oops, something went wrong.