Escape problematic characters in lemmas in rule files

Related to #673. This way, users never need to escape anything by hand, except for periods in tags.
rmlockwood · Aug 14, 2024 · 0f80c47 · 0f80c47
1 parent 6794566
commit 0f80c47
Showing 1 changed file with 29 additions and 21 deletions.
diff --git a/Utils.py b/Utils.py
@@ -1783,35 +1783,43 @@ def check_for_cat_errors(report, dbType, posFullNameStr, posAbbrStr, countList,
 
 def stripRulesFile(report, buildFolder, tranferRulePath, strippedRulesFileName):
 
-    # Open the existing rule file and read all the lines
+    # Parse the existing rule file as XML
     try:
-        f = open(tranferRulePath ,"r", encoding='utf-8')
+        # Note that by default this will strip comments and headers
+        # (even though that is no longer necessary on newer versions
+        # of apertium-transfer)
+        tree = ET.parse(tranferRulePath).getroot()
     except:
         report.Error(f'Error in opening the file: "{tranferRulePath}", check that it exists.')
         return True
 
-    lines = f.readlines()
-    f.close()
+    # Lemmas in <cat-item> are not compared for string equality,
+    # so we don't need to escape the other special characters,
+    # but * will be treated as a glob matching any sequence of characters,
+    # so we escape it here.
+    # If any users do want the glob behavior, we'll have a problem, but
+    # that strikes me as less likely.
+    for cat in tree.findall('.//cat-item'):
+        if 'lemma' in cat.attrib:
+            cat.attrib['lemma'] = cat.attrib['lemma'].replace('*', '\\*')
+
+    # If we're only doing one-stage transfer, then really we only need to
+    # escape things when we're comparing against input (so .//test//lit),
+    # but we might be doing multi-stage transfer and it doesn't hurt
+    # anything to also escape the output (and it's less complicated).
+    for tag in ['lit', 'list-item']:
+        for node in tree.findall('.//'+tag):
+            if 'v' in node.attrib:
+                for char in '\\*^$/<>{}':  # backslash first, so the escapes added below are not re-escaped
+                    node.attrib['v'] = node.attrib['v'].replace(char, '\\'+char)
 
     # Create a new file tr.t1x to be used by Apertium
-    f = open(os.path.join(buildFolder, strippedRulesFileName) ,"w", encoding='utf-8')
-
-    # Go through the existing rule file and write everything to the new file except Doctype stuff.
-    for line in lines:
-
-        strippedLine = line.strip()
-
-        if strippedLine == '<!DOCTYPE transfer PUBLIC "-//XMLmind//DTD transfer//EN"' or \
-               strippedLine == '<!DOCTYPE interchunk PUBLIC "-//XMLmind//DTD interchunk//EN"' or \
-               strippedLine == '<!DOCTYPE postchunk PUBLIC "-//XMLmind//DTD postchunk//EN"' or \
-               strippedLine == '"transfer.dtd">' or \
-               strippedLine == '"interchunk.dtd">' or \
-               strippedLine == '"postchunk.dtd">':
-            continue
-
+    outPath = os.path.join(buildFolder, strippedRulesFileName)
+    with open(outPath, 'w', encoding='utf-8') as fout:
+        text = ET.tostring(tree, encoding='unicode')
         # Always write transfer rule data as decomposed
-        f.write(unicodedata.normalize('NFD', line))
-    f.close()
+        text = unicodedata.normalize('NFD', text)
+        fout.write(text)
 
     return False