Skip to content

Commit

Permalink
Add Kanji stroke order and large rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackson-S committed Aug 1, 2019
1 parent a66594d commit f621236
Show file tree
Hide file tree
Showing 12 changed files with 369 additions and 167 deletions.
20 changes: 20 additions & 0 deletions argument_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import argparse

class ArgumentParser:
    """Command-line interface of the JMdict -> Dictionary.app converter.

    The command line is parsed once, at construction time; the accessor
    methods expose the individual values afterwards.
    """

    def __init__(self, args=None):
        """Build the underlying argparse parser and parse the arguments.

        Args:
            args: Optional list of argument strings. When ``None`` (the
                default) argparse falls back to ``sys.argv[1:]``, which is
                what the parameterless constructor has always done.
        """
        self.parser = self._setup_parser()
        self.args = self.parser.parse_args(args)

    def _setup_parser(self):
        """Return an ``argparse.ArgumentParser`` with all options declared."""
        parser = argparse.ArgumentParser(
            prog="JMdict -> Dictionary.app converter"
        )
        parser.add_argument(
            'dictionary',
            metavar='path',
            type=str,
            help="The input dictionary",
        )
        # Neither flag starts with "--", so argparse would derive the
        # destination from the first one ("kvg"). Name it explicitly so that
        # kanji_location() can read ``args.kanji_vg``.
        parser.add_argument(
            "-kvg",
            "-kanji_vg",
            dest="kanji_vg",
            help="Directory containing the KanjiVG library",
        )
        return parser

    def kanji_location(self):
        """Return the KanjiVG directory, or ``None`` if it was not given."""
        return self.args.kanji_vg

    def dictionary_location(self):
        """Return the path of the input dictionary."""
        return self.args.dictionary
53 changes: 53 additions & 0 deletions english_entry_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from entry import Entry

class EnglishEntryParser:
    """Build English-keyed entries from already parsed Japanese entries.

    Every definition of every common Japanese entry becomes (or is merged
    into) an ``Entry`` whose title is the English word and whose definitions
    are the Japanese titles that translate to it.
    """

    def __init__(self, japanese_entries):
        """Create the English entries for *japanese_entries*.

        Args:
            japanese_entries: Iterable of entry objects exposing ``kanji``,
                ``readings``, ``definitions`` and ``title``.
        """
        self.existing_entries = dict()
        self._create_entries(japanese_entries)
        self.entries = list(self.existing_entries.values())

    @staticmethod
    def _strip_definition(definition):
        """Return *definition* without its parenthesised part.

        Everything from the first "(" onwards is dropped and surrounding
        whitespace is removed, so that "dog (animal)" and "dog" map to the
        same dictionary key.
        """
        return definition.partition("(")[0].strip()

    @staticmethod
    def _get_senses(definition):
        """Return the text of the first parenthesised group as a list.

        Returns:
            A one-element list holding the text between the first "(" and
            the first ")" that follows it, or an empty list when there is no
            such pair.
        """
        start = definition.find("(")
        if start == -1:
            return []
        # Only a ")" located after the "(" can close the group.
        end = definition.find(")", start + 1)
        if end == -1:
            return []
        return [definition[start + 1:end]]

    @staticmethod
    def _bare_verb(word):
        """Return *word* without a leading "to ", or ``None``.

        Only the separate word "to" counts, so "to throw" yields "throw"
        while "today" and "tomato" yield ``None``.
        """
        prefix = "to "
        if not word.startswith(prefix):
            return None
        remainder = word[len(prefix):].strip()
        return remainder or None

    @staticmethod
    def _is_common(japanese_entry):
        """Tell whether both the first kanji and first reading are common.

        Entries without any kanji or without any reading are treated as not
        common.
        """
        try:
            if not japanese_entry.kanji[0].is_common:
                return False
            return bool(japanese_entry.readings[0].is_common)
        except IndexError:
            return False

    def _create_entries(self, japanese_entries):
        """Fill ``self.existing_entries`` from *japanese_entries*."""
        for japanese_entry in japanese_entries:
            # The commonness test does not depend on the definition, so it
            # is evaluated once per entry rather than once per gloss.
            if not self._is_common(japanese_entry):
                continue

            for sense in japanese_entry.definitions:
                for definition in sense.definition:
                    word = self._strip_definition(definition)

                    # A gloss that is nothing but a parenthesised note has
                    # no headword to file it under.
                    if not word:
                        continue

                    if word not in self.existing_entries:
                        entry_id = f"en_{len(self.existing_entries) + 1}0"
                        self.existing_entries[word] = Entry(entry_id, word)

                    entry = self.existing_entries[word]
                    entry.add_index(word)

                    # Add an index for e.g. to throw -> throw
                    bare_verb = self._bare_verb(word)
                    if bare_verb is not None:
                        entry.add_index(bare_verb)

                    entry_translation = [japanese_entry.title]
                    senses = self._get_senses(definition)

                    entry.add_definition(
                        entry_translation,
                        sense_info=senses,
                        misc_info=sense.misc_info,
                    )
34 changes: 21 additions & 13 deletions entry.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
import re
import jaconv
import xml.etree.ElementTree as ET

from jinja2 import Template, Environment, select_autoescape, FileSystemLoader
import xml.etree.ElementTree as ET
from htmlmin import Minifier

from kanji import Kanji
from reading import Reading
from definition import Definition
from kanji_vg_parser import KanjiImage

class Entry:
_MINIFIER = Minifier(
remove_comments=True,
remove_empty_space=True,
remove_all_empty_space=True,
remove_optional_attribute_quotes=False
)


class Entry:
_ENVIRONMENT = Environment(
loader=FileSystemLoader("./entry_templates"),
autoescape=select_autoescape(
Expand All @@ -26,9 +21,12 @@ class Entry:

_TEMPLATE = _ENVIRONMENT.get_template('standard_entry.html')

_KANJI_DB = KanjiImage("./kanji")

def __init__(self, entry_id, title):
self.id = entry_id.strip()
self.title = title.strip()
self.stroke_image = None
self.indices = []
self.kanji = []
self.readings = []
Expand Down Expand Up @@ -88,13 +86,23 @@ def _compile_page(self, page_root):
# Generate the page text using Jinja2
page_text = self._TEMPLATE.render(entry=self)
# Minify the page output (So that compilation doesn't crash later from too much input)
minified_page = self._MINIFIER.minify(page_text)
minified_page = re.sub(r">[\s]*<", "><", page_text)
# Go through each element and reimport it (Removes <body> tag from template, saving file size)
for element in ET.fromstring(minified_page):
page_root.append(element)
try:
for element in ET.fromstring(minified_page):
page_root.append(element)
except ET.ParseError as e:
print(minified_page)
raise e

def _add_stroke_order(self):
if len(self.title) == 1 and self._KANJI_DB.has_image(self.title):
self.stroke_image = self._KANJI_DB.get_image_path(self.title)


def compile_entry(self):
    """Build and return the ``d:entry`` XML element for this entry.

    The element carries the entry id and title as attributes. The stroke
    order lookup runs first, then the index and page compilation steps
    receive the new element.
    """
    attributes = {"id": self.id, "d:title": self.title}
    entry_node = ET.Element("d:entry", attributes)

    self._add_stroke_order()
    for compile_step in (self._compile_indices, self._compile_page):
        compile_step(entry_node)

    return entry_node
87 changes: 49 additions & 38 deletions entry_templates/standard_entry.html
Original file line number Diff line number Diff line change
Expand Up @@ -23,54 +23,64 @@ <h1 class="containerItem reading">「{{ entry.readings[0].reading }}」</h1>
{% endif %}
</div>

<!-- Alternative Forms -->
{% if entry.kanji | count > 1 or entry.readings | count > 1 %}
<div class="alternativeFormsContainer">
<!-- Alternative Kanji -->
{% if entry.kanji | count > 1 %}
<div class="kanjiContainer">
<span class="containerHeader">Alternative Forms</span>
{% for kanji in entry.kanji[1:] %}
<div class="containerItem alternativeForm">
<div class="kanjiReading">{{ kanji.kanji }}</div>
{% for info in kanji.extra_info %}
<div class="subContainerItem info">
<div class="rightArrow"></div>
<div class="kanjiInfo info">{{ info }}</div>
<div class="subheaderContainer">
<!-- Alternative Forms -->
{% if entry.kanji | count > 1 or entry.readings | count > 1 %}
<div class="alternativeFormsContainer subheaderSubcontainer">
<!-- Alternative Kanji -->
{% if entry.kanji | count > 1 %}
<div class="kanjiContainer">
<div class="containerHeader">Alternative Forms</div>
{% for kanji in entry.kanji[1:] %}
<div class="containerItem alternativeForm">
<div class="kanjiReading">{{ kanji.kanji }}</div>
{% for info in kanji.extra_info %}
<div class="subContainerItem info">
<div class="rightArrow"></div>
<div class="kanjiInfo info">{{ info }}</div>
</div>
{% endfor %}
{% if kanji.is_common %}
<div class="subContainerItem badge common">Common Kanji</div>
{% endif %}
</div>
{% endfor %}
{% if kanji.is_common %}
<div class="subContainerItem badge common">Common Kanji</div>
{% endif %}
</div>
{% endfor %}
</div>
{% endif %}
{% endif %}

<!-- Alternative Readings -->
{% if entry.readings | count > 1 %}
<div class="readingsContainer">
<span class="containerHeader">Alternative Readings</span>
{% for reading in entry.readings %}
{% if reading.reading != entry.title %}
<div class="containerItem alternativeReading">
<div class="reading">{{ reading.reading }}</div>
{% if reading.is_common %}
<div class="subContainerItem badge common">Common Kana</div>
<!-- Alternative Readings -->
{% if entry.readings | count > 1 %}
<div class="readingsContainer subheaderSubcontainer">
<span class="containerHeader">Alternative Readings</span>
{% for reading in entry.readings %}
{% if reading.reading != entry.title %}
<div class="containerItem alternativeReading">
<div class="reading">{{ reading.reading }}</div>
{% if reading.is_common %}
<div class="subContainerItem badge common">Common Kana</div>
{% endif %}
{% for related in reading.relates_to %}
<div class="subCountainerItem relatesTo">({{ related }})</div>
{% endfor %}
{% if not reading.is_true_reading %}
<div class="untrueReading badge">Reference Only</div>
{% endif %}
</div>
{% endif %}
{% for related in reading.relates_to %}
<div class="subCountainerItem relatesTo">({{ related }})</div>
{% endfor %}
{% if not reading.is_true_reading %}
<div class="untrueReading badge">Reference Only</div>
{% endif %}
</div>
{% endif %}
{% endfor %}
</div>
{% endif %}

<!-- Stroke Order -->
{% if entry.stroke_image != None %}
<div class="kanjiPathContainer subheaderSubcontainer">
<div class="containerHeader">Stroke Order</div>
<img src="Images/{{ entry.stroke_image }}" />
</div>
{% endif %}
</div>
{% endif %}


<!-- Definitions -->
Expand Down Expand Up @@ -119,7 +129,8 @@ <h1 class="containerItem reading">「{{ reading }}」</h1>
{% if reference.reference_id != None %}
<div class="subContainerItem info">
<div class="subArrow"></div>
<div class="senseInfo info">See also: <a href="x-dictionary:r:{{ reference.reference_id }}">{{ reference.reference_word }}</a></div>
<div class="senseInfo info">See also: <a
href="x-dictionary:r:{{ reference.reference_id }}">{{ reference.reference_word }}</a></div>
</div>
{% endif %}
{% endfor %}
Expand Down
109 changes: 109 additions & 0 deletions japanese_entry_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import itertools

import xml.etree.ElementTree as ET

from reference_tracker import ReferenceTracker
from entry import Entry


class JapaneseEntryParser:
    """Parse a JMdict XML file into ``Entry`` objects.

    Every ``<entry>`` element directly under the document root becomes one
    ``Entry`` populated with its kanji forms, readings and senses. Each
    sense is also registered with a ``ReferenceTracker``.
    """

    def __init__(self, dictionary_location):
        """Parse the dictionary found at *dictionary_location*.

        Args:
            dictionary_location: Source handed to
                ``xml.etree.ElementTree.parse``.
        """
        self.ref_tracker = ReferenceTracker()
        self.entries = []

        # Get the root element of the input dictionary
        input_root = ET.parse(dictionary_location).getroot()

        for entry_tag in input_root.findall("entry"):
            # Generate an id with a JP prefix to denote japanese page
            entry_id = f"jp_{entry_tag.find('ent_seq').text}"
            entry_title = self._get_entry_title(entry_tag)

            entry = Entry(entry_id, entry_title)

            self._get_kanji(entry_tag, entry)
            self._get_readings(entry_tag, entry)
            self._get_translations(entry_tag, entry)

            self.entries.append(entry)

    @staticmethod
    def _generate_text_list(tag_search):
        """Return the ``text`` of every element in *tag_search* as a list."""
        return [tag.text for tag in tag_search]

    @staticmethod
    def _get_entry_title(entry_tag):
        """Return the first kanji form of the entry, else its first reading."""
        kanji_tags = entry_tag.findall("k_ele")

        # See if there's a Kanji title
        if kanji_tags:
            return kanji_tags[0].find("keb").text

        # Otherwise return kana
        return entry_tag.find("r_ele").find("reb").text

    @staticmethod
    def _get_kanji(entry_tag, entry):
        """Add every ``k_ele`` of *entry_tag* to *entry* as kanji and index."""
        text_list = JapaneseEntryParser._generate_text_list

        # Fetch all related data for each kanji element
        for element in entry_tag.findall("k_ele"):
            kanji = element.find("keb").text
            information = text_list(element.findall("ke_inf"))
            priority = text_list(element.findall("ke_pri"))

            # Create an index entry for the kanji
            entry.add_index(kanji)
            # Add the kanji to the entry
            entry.add_kanji(kanji, information, priority)

    @staticmethod
    def _get_readings(entry_tag, entry):
        """Add every ``r_ele`` of *entry_tag* to *entry* as reading and index."""
        text_list = JapaneseEntryParser._generate_text_list

        # Fetch all related data for each reading element
        for element in entry_tag.findall("r_ele"):
            reading = element.find("reb").text
            information = text_list(element.findall("re_inf"))
            related_kanji = text_list(element.findall("re_restr"))
            priority = text_list(element.findall("re_pri"))
            # A true reading does not contain the "re_nokanji" tag
            is_true = element.find("re_nokanji") is None

            # Create an index for the reading
            entry.add_index(reading)
            # Add the reading to the entry
            entry.add_reading(reading, information, is_true, related_kanji, priority)

    def _get_translations(self, entry_tag, entry):
        """Add every ``sense`` of *entry_tag* to *entry* as a definition.

        Each sense is also reported to the reference tracker together with
        its position inside the entry.
        """
        text_list = self._generate_text_list

        for index, element in enumerate(entry_tag.findall("sense")):
            # Chain the kanji and reading related tags together and generate a list
            related_readings = text_list(
                itertools.chain(
                    element.findall("stagk"),
                    element.findall("stagr")
                )
            )

            speech_parts = text_list(element.findall("pos"))
            x_references = text_list(element.findall("xref"))
            antonyms = text_list(element.findall("ant"))
            fields = text_list(element.findall("field"))
            misc = text_list(element.findall("misc"))
            # Sense information lives in <s_inf>; a <sense> element never
            # nests another <sense>, so searching for "sense" here always
            # produced an empty list.
            senses = text_list(element.findall("s_inf"))
            source_langs = text_list(element.findall("lsource"))
            dialects = text_list(element.findall("dial"))
            translations = text_list(element.findall("gloss"))

            self.ref_tracker.add_reference(index, entry.id, entry.title)

            entry.add_definition(
                translations,
                x_references,
                speech_parts,
                related_readings,
                antonyms,
                fields,
                misc,
                senses,
                source_langs,
                dialects,
            )

    def get_entries(self):
        """Return the list of parsed entries."""
        return self.entries

    def get_references(self):
        """Return the reference tracker filled while parsing."""
        return self.ref_tracker
Binary file added kanji_vg.zip
Binary file not shown.
Loading

0 comments on commit f621236

Please sign in to comment.