Skip to content

Commit

Permalink
Convert sentences to sql file
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackson-S authored Oct 1, 2019
1 parent 563fa10 commit 00bf8a7
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 90 deletions.
62 changes: 18 additions & 44 deletions DictionaryEntry.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import xml.etree.ElementTree as ElementTree
import sqlite3

from dataclasses import dataclass
from typing import List, Optional, Set, Dict

import MeCab
import jaconv

VERB_BADGES = ["Ichidan", "Ichidan (くれる)", "Godan (〜ある)", "Godan (〜ぶ)", "Godan (〜ぐ)",
"Godan (いく・ゆく)", "Godan (〜く)", "Godan (〜む)", "Godan (〜ぬ)",
"Godan Irregular (〜る)", "Godan (〜る)", "Godan (〜す)",
Expand All @@ -23,49 +21,20 @@
**{x: "Noun" for x in NOUN_BADGES}
}


class Sentence:
    """One example sentence parsed from the sentences XML.

    Reads the English/Japanese text from the tag's attributes, collects the
    dictionary-form words the sentence indexes, and pre-renders the Japanese
    text with HTML ruby (furigana) markup.
    """

    # Shared morphological analyser; with "-Ochasen" output each line carries
    # the reading in the second tab-separated field.
    PARSER = MeCab.Tagger("-Ochasen")

    def __init__(self, tag: ElementTree.Element) -> None:
        # Sentence text comes straight from the tag attributes.
        self.english: str = tag.attrib["en"]
        self.japanese: str = tag.attrib["jp"]
        # Dictionary forms of the words this sentence illustrates.
        self.keys: Set[str] = self._get_keys(tag)
        # Maps dictionary form -> sense index (only for indices that have one).
        self.sense_indices: Dict[str, str] = self._get_senses(tag)
        # Pre-rendered HTML version of the Japanese sentence with furigana.
        self.furigana_html: str = self._generate_furigana(self.japanese)

    def _get_keys(self, tag: ElementTree.Element) -> Set[str]:
        """Return the set of dictionary forms listed in the tag's <index> children."""
        return set(x.attrib["dictionary_form"] for x in tag.findall("index"))

    def _get_senses(self, tag: ElementTree.Element) -> Dict[str, str]:
        """Map each indexed dictionary form to its sense index, where present."""
        indices = filter(lambda x: "sense_index" in x.attrib, tag.findall("index"))
        return {x.attrib["dictionary_form"]: x.attrib["sense_index"] for x in indices}

    def _generate_furigana(self, japanese_sentence: str) -> str:
        """Return the sentence as HTML, wrapping kanji tokens in <ruby> tags."""
        # The final line of the parser output is a terminator, not a token
        # (MeCab's EOS line), so it is dropped below.
        parser_output = self.PARSER.parse(japanese_sentence).splitlines()

        result = []

        for line in parser_output[:-1]:
            tokens = line.split("\t")
            if len(tokens) == 1:
                # No reading field (e.g. bare punctuation): keep the token as-is.
                result.append(tokens[0])
                continue

            # ChaSen readings are katakana; convert to hiragana for rubytext.
            replacement = jaconv.kata2hira(tokens[1])
            # Only add ruby when the surface form differs from its reading in
            # both scripts — i.e. the token actually contains kanji.
            if tokens[0] not in (replacement, jaconv.hira2kata(tokens[1])):
                result.append(f"<ruby>{tokens[0]}<rt>{replacement}</rt></ruby>")
            else:
                result.append(tokens[0])
        return "".join(result)

DB = sqlite3.connect("output/dictionary.db")

class Entry:
    """Base class for a dictionary page: holds the title and a unique page id."""

    def __init__(self, page_title: str, language: str, entry_type: str):
        # The id embeds language and entry kind so identical titles cannot
        # collide across page types (e.g. "jp_dictionary_<title>").
        self.page_title: str = page_title
        self.page_id: str = f"{language}_{entry_type}_{page_title}"


@dataclass
class Sentence:
    """An English/Japanese example-sentence pair attached to a dictionary entry."""

    # English translation of the sentence.
    english: str
    # Japanese text; populated from the Sentences table's sentence_html_ruby
    # column, so it may contain HTML <ruby> markup (templates render it |safe).
    japanese: str

@dataclass
class Definition:
pos: List[str]
Expand All @@ -87,10 +56,10 @@ class Reading:


class JapaneseEntry(Entry):
def __init__(self, entry: ElementTree.Element, sentences: Dict[str, Sentence]):
def __init__(self, entry: ElementTree.Element):
super().__init__(entry.attrib["title"], "jp", "dictionary")
self.containing_kanji: List[str] = self._get_containing_kanji(entry)
self.sentences: List[Sentence] = self._get_sentences(sentences)
self.sentences: List[Sentence] = self._get_sentences()
self.readings: List[Reading] = self._get_readings(entry)
self.kanji: List[Reading] = self._get_kanji(entry)
self.definitions: List[Definition] = self._get_definitions(entry)
Expand Down Expand Up @@ -121,9 +90,14 @@ def _get_readings(self, tag: ElementTree.Element) -> List[Reading]:
result.append(Reading(name, info))
return result

def _get_sentences(self, sentences: Dict[str, Sentence]):
result = sentences.get(self.page_title, [])
result.sort(key=lambda x: int(x.sense_indices.get(self.page_title, 1000)))
def _get_sentences(self):
    """Fetch the example sentences indexed under this entry's title.

    Queries the shared sentence database for rows whose `word` column matches
    the page title and wraps each (english, ruby-HTML japanese) row in a
    Sentence. Returns an empty list when no sentences exist for the word.
    """
    cursor = DB.cursor()
    try:
        rows = cursor.execute(
            "SELECT sentence_en, sentence_html_ruby FROM Sentences WHERE word=?",
            (self.page_title,),
        ).fetchall()
    finally:
        # The original left the cursor open; close it so each entry lookup
        # does not leak a cursor on the module-level connection.
        cursor.close()

    return [Sentence(en, jp) for en, jp in rows]

def _get_containing_kanji(self, tag: ElementTree.Element) -> List[str]:
Expand Down
2 changes: 1 addition & 1 deletion assets/japanese_definition_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ <h3 class="section_heading" apple_mouseover_disable="1">Kanji in this Term</h3>
<summary class="section_heading" apple_mouseover_disable="1">Example Sentences</summary>
{% for sentence in entry.sentences %}
<article class="sentence">
<p>{{ sentence.furigana_html | safe }}</p>
<p>{{ sentence.japanese | safe }}</p>
<p>{{ sentence.english }}</p>
</article>
{% endfor %}
Expand Down
20 changes: 3 additions & 17 deletions combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("dictionary", type=str)
parser.add_argument("kanji", type=str)
parser.add_argument("sentences", type=str)
parser.add_argument("english_wordlist", type=str)
parser.add_argument("-o", type=str)
return parser.parse_args()
Expand All @@ -64,27 +63,14 @@ def create_kanji_pages(kanji_path: str, kanji_images: Set[str]) -> Dict[str, Kan
return result


def create_japanese_pages(dict_path: str, sentence_path: str) -> Dict[str, JapaneseEntry]:
sentence_tree = ElementTree.parse(sentence_path)
sentence_root = sentence_tree.getroot()

# Maps words to sentences containing them
sentence_index_list = {}
for item in sentence_root:
# Create a sentence object, which will process and split the sentence
sentence = Sentence(item)

for key in sentence.keys:
sentence_index_list.setdefault(key, [])
sentence_index_list[key].append(sentence)

def create_japanese_pages(dict_path: str) -> Dict[str, JapaneseEntry]:
dictionary_tree = ElementTree.parse(dict_path)
dictionary_root = dictionary_tree.getroot()

result = dict()

for entry in dictionary_root:
new_entry = JapaneseEntry(entry, sentence_index_list)
new_entry = JapaneseEntry(entry)
if new_entry.is_worth_adding():
result[new_entry.page_title] = new_entry

Expand Down Expand Up @@ -133,7 +119,7 @@ def main():

pages = {**pages, **create_kanji_pages(args.kanji, image_set)}

pages = {**pages, **create_japanese_pages(args.dictionary, args.sentences)}
pages = {**pages, **create_japanese_pages(args.dictionary)}

japanese_entries = set(filter(lambda x: isinstance(x, JapaneseEntry), pages.values()))

Expand Down
4 changes: 2 additions & 2 deletions compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ tar -xzf ./assets/kanjivg.tar.xz -C ./build/OtherResources/Images

# Convert the sample sentences into a new, simplified XML file containing only needed data
echo "Processing sample sentences"
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/sentences.xml
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/dictionary.db

# Convert the similar kanji into a SQL database
echo "Compiling similar Kanji"
Expand All @@ -45,7 +45,7 @@ python3 ./dictionary_converter.py ./input/JMdict_e.xml

# Combine the simplified XML files into the output Apple Dictionary XML file.
echo "Combining processed files"
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./output/sentences.xml ./input/english.txt -o ./build/JapaneseDictionary.xml
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./input/english.txt -o ./build/JapaneseDictionary.xml

# Traverse to the output directory in preparation to build
echo "Building dictionary (This will take a long time, i.e. 10+ minutes"
Expand Down
4 changes: 2 additions & 2 deletions compile_sample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ tar -xzf ./assets/kanjivg.tar.xz -C ./build/OtherResources/Images
mkdir output

echo "Processing sample sentences"
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/sentences.xml
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/dictionary.db
echo "Compiling similar Kanji"
python3 ./kanji_relation_db.py
echo "Processing Kanji"
python3 ./kanjidic_converter.py ./input/kanjidic2_sample.xml
echo "Processing Dictionary"
python3 ./dictionary_converter.py ./input/JMdict_e_sample.xml
echo "Combining processed files"
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./output/sentences.xml ./input/english.txt -o ./build/JapaneseDictionary.xml
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./input/english.txt -o ./build/JapaneseDictionary.xml

cd build
echo "Building dictionary (This will take a long time, i.e. 10+ minutes!)"
Expand Down
2 changes: 1 addition & 1 deletion dictionary_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from typing import List, Tuple

db = sqlite3.connect("output/kanji.db")
db = sqlite3.connect("output/dictionary.db")
cursor = db.cursor()

CLASSIFICATIONS = {
Expand Down
2 changes: 1 addition & 1 deletion kanji_relation_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sqlite3
from xml.etree import ElementTree

db = sqlite3.connect("output/kanji.db")
db = sqlite3.connect("output/dictionary.db")
cursor = db.cursor()

cursor.execute("""
Expand Down
2 changes: 1 addition & 1 deletion kanjidic_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
from dataclasses import dataclass

similar_db = sqlite3.connect("output/kanji.db")
similar_db = sqlite3.connect("output/dictionary.db")

@dataclass
class Reading:
Expand Down
72 changes: 51 additions & 21 deletions sentence_converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import csv
import MeCab
import sqlite3
import argparse
import xml.etree.ElementTree as ElementTree
import jaconv

from typing import Optional, List, Dict

PARSER = MeCab.Tagger("-Ochasen")

class WordIndex:
def __init__(self, parameters: str):
Expand Down Expand Up @@ -39,11 +42,38 @@ def __init__(self, jp_sentence: str, en_sentence: str, indices: str):
self.jp: str = jp_sentence
# The sentence in English
self.en: str = en_sentence
# The sentence in Japanese with Rubytext
self.jp_ruby: str = self.generate_ruby()

# A List of indices that the dictionary will use to assign appropriate
# sentences, made up of the words contained within the sentence.
self.indices: List[WordIndex] = self.generate_indices(indices)

def generate_ruby(self):
    """Render the Japanese sentence as HTML with furigana on kanji tokens."""
    # Drop the final parser line, then split each token line on tabs.
    token_lines = PARSER.parse(self.jp).splitlines()[:-1]

    pieces = []

    for fields in (line.split("\t") for line in token_lines):
        surface = fields[0]

        # Lines without a reading field are passed through untouched.
        if len(fields) == 1:
            pieces.append(surface)
            continue

        # The analyser emits readings in katakana; build both scripts so we
        # can tell whether the surface form actually contains kanji.
        reading_hiragana = jaconv.kata2hira(fields[1])
        reading_katakana = jaconv.hira2kata(fields[1])

        # A surface form identical to its reading needs no annotation.
        if surface in (reading_hiragana, reading_katakana):
            pieces.append(surface)
        else:
            pieces.append(f"<ruby>{surface}<rt>{reading_hiragana}</rt></ruby>")

    return "".join(pieces)

def generate_indices(self, indices: str) -> List[WordIndex]:
result: List[WordIndex] = []

Expand All @@ -61,7 +91,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("string_file", type=argparse.FileType("r"))
parser.add_argument("index_file", type=argparse.FileType("r"))
parser.add_argument("-output", "-o", type=argparse.FileType("wb"))
parser.add_argument("--database", "-o", type=str)
args = parser.parse_args()

# Create iterators for the input CSV files
Expand All @@ -79,34 +109,34 @@ def main():
sentence_pairs: List[SentencePair] = []

for jp_id, en_id, parameters in index_csv:
# Check there's at least one verified word
# Check there's at least one verified word ("~" indicates verification)
if "~" in parameters:
if jp_id in sentence_list and en_id in sentence_list:
jp_sentence = sentence_list[jp_id]
en_sentence = sentence_list[en_id]
sentence_pair = SentencePair(
jp_sentence, en_sentence, parameters)
sentence_pair = SentencePair(jp_sentence, en_sentence, parameters)
sentence_pairs.append(sentence_pair)

root = ElementTree.Element("sentences")
db = sqlite3.connect(args.database)
cursor = db.cursor()

# Generate an XML tree to output
for pair in sentence_pairs:
attributes = {"jp": pair.jp, "en": pair.en}
sentence_node = ElementTree.SubElement(root, "entry", attributes)
cursor.execute("""
CREATE TABLE IF NOT EXISTS Sentences (
word TEXT, -- The word that this sentence is an example for
sentence_en TEXT, -- The English translation of the sentence
sentence_jp TEXT, -- The original unmodified Japanese version of the sentence
sentence_html_ruby TEXT, -- The sentence with HTML rubytext tags added
UNIQUE (word, sentence_en, sentence_jp) -- Ensure all unique sentences
)
""")

for pair in sentence_pairs:
for index in pair.indices:
attributes = {"dictionary_form": index.dictionary_form}

if index.sense_number:
attributes["sense_index"] = index.sense_number

ElementTree.SubElement(sentence_node, "index", attributes)

# Output the XML tree
tree = ElementTree.ElementTree(root)
tree.write(args.output, "UTF-8", True)

cursor.execute("INSERT OR IGNORE INTO Sentences VALUES (?, ?, ?, ?)", (index.dictionary_form, pair.en, pair.jp, pair.jp_ruby))

cursor.close()
db.commit()
db.close()

if __name__ == "__main__":
main()

0 comments on commit 00bf8a7

Please sign in to comment.