Skip to content

Commit

Permalink
Escape problematic characters in lemmas in rule files
Browse files Browse the repository at this point in the history
related to #673

This way users never need to escape anything by hand except for
periods in tags.
  • Loading branch information
mr-martian committed Aug 14, 2024
1 parent 6794566 commit 0f80c47
Showing 1 changed file with 29 additions and 21 deletions.
50 changes: 29 additions & 21 deletions Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1783,35 +1783,43 @@ def check_for_cat_errors(report, dbType, posFullNameStr, posAbbrStr, countList,

def stripRulesFile(report, buildFolder, tranferRulePath, strippedRulesFileName):
    """Write an escaped, NFD-normalized copy of a rule file for Apertium.

    The rule file is parsed as XML, special characters in lemmas and
    literal values are backslash-escaped, and the resulting tree is
    serialized to ``strippedRulesFileName`` inside ``buildFolder``.

    Args:
        report: Object with an ``Error(message)`` method used to report
            a failure to read the rule file.
        buildFolder: Folder in which the output file is created.
        tranferRulePath: Path of the existing rule file to read.
        strippedRulesFileName: Name of the output file within
            ``buildFolder``.

    Returns:
        True if the rule file could not be read or parsed (an error has
        been reported), False on success.
    """
    # Open the existing rule file
    try:
        # Note that by default this will strip comments and headers
        # (even though that is no longer necessary on newer versions
        # of apertium-transfer)
        tree = ET.parse(tranferRulePath).getroot()
    except (OSError, ET.ParseError):
        # Only I/O and XML syntax failures are expected here; a blanket
        # except would also hide programming errors such as a misspelled name.
        report.Error(f'Error in opening the file: "{tranferRulePath}", check that it exists.')
        return True

    # Lemmas in <cat-item> are not compared for string equality,
    # so we don't need to escape the other special characters,
    # but * will be treated as a glob matching any sequence of characters,
    # so we escape it here.
    # If any users do want the glob behavior, we'll have a problem, but
    # that strikes me as less likely.
    for cat in tree.findall('.//cat-item'):
        if 'lemma' in cat.attrib:
            cat.attrib['lemma'] = cat.attrib['lemma'].replace('*', '\\*')

    # Every special character maps to itself preceded by a backslash.
    # Translating in a single pass means a backslash added for one
    # character can never be escaped again by a later replacement.
    escapeTable = str.maketrans({char: '\\' + char for char in '\\*^$/<>{}'})

    # If we're only doing one-stage transfer, then really we only need to
    # escape things when we're comparing against input (so .//test//lit),
    # but we might be doing multi-stage transfer and it doesn't hurt
    # anything to also escape the output (and it's less complicated).
    for tag in ['lit', 'list-item']:
        for node in tree.findall('.//' + tag):
            if 'v' in node.attrib:
                node.attrib['v'] = node.attrib['v'].translate(escapeTable)

    # Create a new file tr.t1x to be used by Apertium
    outPath = os.path.join(buildFolder, strippedRulesFileName)
    with open(outPath, 'w', encoding='utf-8') as fout:
        text = ET.tostring(tree, encoding='unicode')
        # Always write transfer rule data as decomposed
        text = unicodedata.normalize('NFD', text)
        fout.write(text)

    return False

Expand Down

0 comments on commit 0f80c47

Please sign in to comment.