Skip to content

Commit

Permalink
Convert sentences to sql file
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackson-S authored Oct 1, 2019
1 parent 563fa10 commit 00bf8a7
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 90 deletions.
62 changes: 18 additions & 44 deletions DictionaryEntry.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import xml.etree.ElementTree as ElementTree
import sqlite3

from dataclasses import dataclass
from typing import List, Optional, Set, Dict

import MeCab
import jaconv

VERB_BADGES = ["Ichidan", "Ichidan (くれる)", "Godan (〜ある)", "Godan (〜ぶ)", "Godan (〜ぐ)",
"Godan (いく・ゆく)", "Godan (〜く)", "Godan (〜む)", "Godan (〜ぬ)",
"Godan Irregular (〜る)", "Godan (〜る)", "Godan (〜す)",
Expand All @@ -23,49 +21,20 @@
**{x: "Noun" for x in NOUN_BADGES}
}


class Sentence:
    """One example sentence parsed from the sentences XML.

    Reads the English/Japanese text from the tag's attributes, collects the
    dictionary-form words the sentence indexes, and pre-renders the Japanese
    text with HTML ruby (furigana) markup.
    """

    # Shared morphological analyser; with "-Ochasen" output each line carries
    # the reading in the second tab-separated field.
    PARSER = MeCab.Tagger("-Ochasen")

    def __init__(self, tag: ElementTree.Element) -> None:
        # Sentence text comes straight from the tag attributes.
        self.english: str = tag.attrib["en"]
        self.japanese: str = tag.attrib["jp"]
        # Dictionary forms of the words this sentence illustrates.
        self.keys: Set[str] = self._get_keys(tag)
        # Maps dictionary form -> sense index (only for indices that have one).
        self.sense_indices: Dict[str, str] = self._get_senses(tag)
        # Pre-rendered HTML version of the Japanese sentence with furigana.
        self.furigana_html: str = self._generate_furigana(self.japanese)

    def _get_keys(self, tag: ElementTree.Element) -> Set[str]:
        """Return the set of dictionary forms listed in the tag's <index> children."""
        return set(x.attrib["dictionary_form"] for x in tag.findall("index"))

    def _get_senses(self, tag: ElementTree.Element) -> Dict[str, str]:
        """Map each indexed dictionary form to its sense index, where present."""
        indices = filter(lambda x: "sense_index" in x.attrib, tag.findall("index"))
        return {x.attrib["dictionary_form"]: x.attrib["sense_index"] for x in indices}

    def _generate_furigana(self, japanese_sentence: str) -> str:
        """Return the sentence as HTML, wrapping kanji tokens in <ruby> tags."""
        # The final line of the parser output is a terminator, not a token
        # (MeCab's EOS line), so it is dropped below.
        parser_output = self.PARSER.parse(japanese_sentence).splitlines()

        result = []

        for line in parser_output[:-1]:
            tokens = line.split("\t")
            if len(tokens) == 1:
                # No reading field (e.g. bare punctuation): keep the token as-is.
                result.append(tokens[0])
                continue

            # ChaSen readings are katakana; convert to hiragana for rubytext.
            replacement = jaconv.kata2hira(tokens[1])
            # Only add ruby when the surface form differs from its reading in
            # both scripts — i.e. the token actually contains kanji.
            if tokens[0] not in (replacement, jaconv.hira2kata(tokens[1])):
                result.append(f"<ruby>{tokens[0]}<rt>{replacement}</rt></ruby>")
            else:
                result.append(tokens[0])
        return "".join(result)

DB = sqlite3.connect("output/dictionary.db")

class Entry:
    """Base class for a dictionary page: holds the title and a unique page id."""

    def __init__(self, page_title: str, language: str, entry_type: str):
        # The id embeds language and entry kind so identical titles cannot
        # collide across page types (e.g. "jp_dictionary_<title>").
        self.page_title: str = page_title
        self.page_id: str = f"{language}_{entry_type}_{page_title}"


@dataclass
class Sentence:
    """An English/Japanese example-sentence pair attached to a dictionary entry."""

    # English translation of the sentence.
    english: str
    # Japanese text; populated from the Sentences table's sentence_html_ruby
    # column, so it may contain HTML <ruby> markup (templates render it |safe).
    japanese: str

@dataclass
class Definition:
pos: List[str]
Expand All @@ -87,10 +56,10 @@ class Reading:


class JapaneseEntry(Entry):
def __init__(self, entry: ElementTree.Element, sentences: Dict[str, Sentence]):
def __init__(self, entry: ElementTree.Element):
super().__init__(entry.attrib["title"], "jp", "dictionary")
self.containing_kanji: List[str] = self._get_containing_kanji(entry)
self.sentences: List[Sentence] = self._get_sentences(sentences)
self.sentences: List[Sentence] = self._get_sentences()
self.readings: List[Reading] = self._get_readings(entry)
self.kanji: List[Reading] = self._get_kanji(entry)
self.definitions: List[Definition] = self._get_definitions(entry)
Expand Down Expand Up @@ -121,9 +90,14 @@ def _get_readings(self, tag: ElementTree.Element) -> List[Reading]:
result.append(Reading(name, info))
return result

def _get_sentences(self, sentences: Dict[str, Sentence]):
result = sentences.get(self.page_title, [])
result.sort(key=lambda x: int(x.sense_indices.get(self.page_title, 1000)))
def _get_sentences(self):
    """Fetch the example sentences indexed under this entry's title.

    Queries the shared sentence database for rows whose `word` column matches
    the page title and wraps each (english, ruby-HTML japanese) row in a
    Sentence. Returns an empty list when no sentences exist for the word.
    """
    cursor = DB.cursor()
    try:
        rows = cursor.execute(
            "SELECT sentence_en, sentence_html_ruby FROM Sentences WHERE word=?",
            (self.page_title,),
        ).fetchall()
    finally:
        # The original left the cursor open; close it so each entry lookup
        # does not leak a cursor on the module-level connection.
        cursor.close()

    return [Sentence(en, jp) for en, jp in rows]

def _get_containing_kanji(self, tag: ElementTree.Element) -> List[str]:
Expand Down
2 changes: 1 addition & 1 deletion assets/japanese_definition_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ <h3 class="section_heading" apple_mouseover_disable="1">Kanji in this Term</h3>
<summary class="section_heading" apple_mouseover_disable="1">Example Sentences</summary>
{% for sentence in entry.sentences %}
<article class="sentence">
<p>{{ sentence.furigana_html | safe }}</p>
<p>{{ sentence.japanese | safe }}</p>
<p>{{ sentence.english }}</p>
</article>
{% endfor %}
Expand Down
20 changes: 3 additions & 17 deletions combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("dictionary", type=str)
parser.add_argument("kanji", type=str)
parser.add_argument("sentences", type=str)
parser.add_argument("english_wordlist", type=str)
parser.add_argument("-o", type=str)
return parser.parse_args()
Expand All @@ -64,27 +63,14 @@ def create_kanji_pages(kanji_path: str, kanji_images: Set[str]) -> Dict[str, Kan
return result


def create_japanese_pages(dict_path: str, sentence_path: str) -> Dict[str, JapaneseEntry]:
sentence_tree = ElementTree.parse(sentence_path)
sentence_root = sentence_tree.getroot()

# Maps words to sentences containing them
sentence_index_list = {}
for item in sentence_root:
# Create a sentence object, which will process and split the sentence
sentence = Sentence(item)

for key in sentence.keys:
sentence_index_list.setdefault(key, [])
sentence_index_list[key].append(sentence)

def create_japanese_pages(dict_path: str) -> Dict[str, JapaneseEntry]:
dictionary_tree = ElementTree.parse(dict_path)
dictionary_root = dictionary_tree.getroot()

result = dict()

for entry in dictionary_root:
new_entry = JapaneseEntry(entry, sentence_index_list)
new_entry = JapaneseEntry(entry)
if new_entry.is_worth_adding():
result[new_entry.page_title] = new_entry

Expand Down Expand Up @@ -133,7 +119,7 @@ def main():

pages = {**pages, **create_kanji_pages(args.kanji, image_set)}

pages = {**pages, **create_japanese_pages(args.dictionary, args.sentences)}
pages = {**pages, **create_japanese_pages(args.dictionary)}

japanese_entries = set(filter(lambda x: isinstance(x, JapaneseEntry), pages.values()))

Expand Down
4 changes: 2 additions & 2 deletions compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ tar -xzf ./assets/kanjivg.tar.xz -C ./build/OtherResources/Images

# Convert the sample sentences into a new, simplified XML file containing only needed data
echo "Processing sample sentences"
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/sentences.xml
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/dictionary.db

# Convert the similar kanji into a SQL database
echo "Compiling similar Kanji"
Expand All @@ -45,7 +45,7 @@ python3 ./dictionary_converter.py ./input/JMdict_e.xml

# Combine the simplified XML files into the output Apple Dictionary XML file.
echo "Combining processed files"
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./output/sentences.xml ./input/english.txt -o ./build/JapaneseDictionary.xml
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./input/english.txt -o ./build/JapaneseDictionary.xml

# Traverse to the output directory in preparation to build
echo "Building dictionary (This will take a long time, i.e. 10+ minutes"
Expand Down
4 changes: 2 additions & 2 deletions compile_sample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ tar -xzf ./assets/kanjivg.tar.xz -C ./build/OtherResources/Images
mkdir output

echo "Processing sample sentences"
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/sentences.xml
python3 ./sentence_converter.py ./input/sentences.csv ./input/jpn_indices.csv -o output/dictionary.db
echo "Compiling similar Kanji"
python3 ./kanji_relation_db.py
echo "Processing Kanji"
python3 ./kanjidic_converter.py ./input/kanjidic2_sample.xml
echo "Processing Dictionary"
python3 ./dictionary_converter.py ./input/JMdict_e_sample.xml
echo "Combining processed files"
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./output/sentences.xml ./input/english.txt -o ./build/JapaneseDictionary.xml
python3 ./combiner.py ./output/dictionary.xml ./output/kanji.xml ./input/english.txt -o ./build/JapaneseDictionary.xml

cd build
echo "Building dictionary (This will take a long time, i.e. 10+ minutes!)"
Expand Down
2 changes: 1 addition & 1 deletion dictionary_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from typing import List, Tuple

db = sqlite3.connect("output/kanji.db")
db = sqlite3.connect("output/dictionary.db")
cursor = db.cursor()

CLASSIFICATIONS = {
Expand Down
2 changes: 1 addition & 1 deletion kanji_relation_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sqlite3
from xml.etree import ElementTree

db = sqlite3.connect("output/kanji.db")
db = sqlite3.connect("output/dictionary.db")
cursor = db.cursor()

cursor.execute("""
Expand Down
2 changes: 1 addition & 1 deletion kanjidic_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
from dataclasses import dataclass

similar_db = sqlite3.connect("output/kanji.db")
similar_db = sqlite3.connect("output/dictionary.db")

@dataclass
class Reading:
Expand Down
72 changes: 51 additions & 21 deletions sentence_converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import csv
import MeCab
import sqlite3
import argparse
import xml.etree.ElementTree as ElementTree
import jaconv

from typing import Optional, List, Dict

PARSER = MeCab.Tagger("-Ochasen")

class WordIndex:
def __init__(self, parameters: str):
Expand Down Expand Up @@ -39,11 +42,38 @@ def __init__(self, jp_sentence: str, en_sentence: str, indices: str):
self.jp: str = jp_sentence
# The sentence in English
self.en: str = en_sentence
# The sentence in Japanese with Rubytext
self.jp_ruby: str = self.generate_ruby()

# A List of indices that the dictionary will use to assign appropriate
# sentences, made up of the words contained within the sentence.
self.indices: List[WordIndex] = self.generate_indices(indices)

def generate_ruby(self):
    """Render the Japanese sentence as HTML with furigana on kanji tokens."""
    # Drop the final parser line, then split each token line on tabs.
    token_lines = PARSER.parse(self.jp).splitlines()[:-1]

    pieces = []

    for fields in (line.split("\t") for line in token_lines):
        surface = fields[0]

        # Lines without a reading field are passed through untouched.
        if len(fields) == 1:
            pieces.append(surface)
            continue

        # The analyser emits readings in katakana; build both scripts so we
        # can tell whether the surface form actually contains kanji.
        reading_hiragana = jaconv.kata2hira(fields[1])
        reading_katakana = jaconv.hira2kata(fields[1])

        # A surface form identical to its reading needs no annotation.
        if surface in (reading_hiragana, reading_katakana):
            pieces.append(surface)
        else:
            pieces.append(f"<ruby>{surface}<rt>{reading_hiragana}</rt></ruby>")

    return "".join(pieces)

def generate_indices(self, indices: str) -> List[WordIndex]:
result: List[WordIndex] = []

Expand All @@ -61,7 +91,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("string_file", type=argparse.FileType("r"))
parser.add_argument("index_file", type=argparse.FileType("r"))
parser.add_argument("-output", "-o", type=argparse.FileType("wb"))
parser.add_argument("--database", "-o", type=str)
args = parser.parse_args()

# Create iterators for the input CSV files
Expand All @@ -79,34 +109,34 @@ def main():
sentence_pairs: List[SentencePair] = []

for jp_id, en_id, parameters in index_csv:
# Check there's at least one verified word
# Check there's at least one verified word ("~" indicates verification)
if "~" in parameters:
if jp_id in sentence_list and en_id in sentence_list:
jp_sentence = sentence_list[jp_id]
en_sentence = sentence_list[en_id]
sentence_pair = SentencePair(
jp_sentence, en_sentence, parameters)
sentence_pair = SentencePair(jp_sentence, en_sentence, parameters)
sentence_pairs.append(sentence_pair)

root = ElementTree.Element("sentences")
db = sqlite3.connect(args.database)
cursor = db.cursor()

# Generate an XML tree to output
for pair in sentence_pairs:
attributes = {"jp": pair.jp, "en": pair.en}
sentence_node = ElementTree.SubElement(root, "entry", attributes)
cursor.execute("""
CREATE TABLE IF NOT EXISTS Sentences (
word TEXT, -- The word that this sentence is an example for
sentence_en TEXT, -- The English translation of the sentence
sentence_jp TEXT, -- The original unmodified Japanese version of the sentence
sentence_html_ruby TEXT, -- The sentence with HTML rubytext tags added
UNIQUE (word, sentence_en, sentence_jp) -- Ensure all unique sentences
)
""")

for pair in sentence_pairs:
for index in pair.indices:
attributes = {"dictionary_form": index.dictionary_form}

if index.sense_number:
attributes["sense_index"] = index.sense_number

ElementTree.SubElement(sentence_node, "index", attributes)

# Output the XML tree
tree = ElementTree.ElementTree(root)
tree.write(args.output, "UTF-8", True)

cursor.execute("INSERT OR IGNORE INTO Sentences VALUES (?, ?, ?, ?)", (index.dictionary_form, pair.en, pair.jp, pair.jp_ruby))

cursor.close()
db.commit()
db.close()

if __name__ == "__main__":
main()

0 comments on commit 00bf8a7

Please sign in to comment.