def stripRulesFile(report, buildFolder, tranferRulePath, strippedRulesFileName):
    """Write an escaped, decomposed copy of a transfer rule file.

    The rule file is parsed as XML, special characters in selected attributes
    are backslash-escaped, and the result is written NFD-normalized to
    ``os.path.join(buildFolder, strippedRulesFileName)``.

    Parameters:
        report: object with an ``Error(message)`` method; only called when the
            rule file cannot be read or parsed.
        buildFolder: directory that receives the output file.
        tranferRulePath: path of the existing rule file (the parameter name
            keeps its historical spelling so keyword callers keep working).
        strippedRulesFileName: file name of the output inside ``buildFolder``.

    Returns:
        True if the rule file could not be opened or parsed (after reporting
        the error), False on success.
    """
    # Parsing with ElementTree drops comments, the XML declaration and the
    # DOCTYPE, so the serialized output contains only the element tree.
    try:
        tree = ET.parse(tranferRulePath).getroot()
    except (OSError, ET.ParseError):
        report.Error(f'Error in opening the file: "{tranferRulePath}", check that it exists.')
        return True

    # In a cat-item lemma, * would be treated as a glob matching any sequence
    # of characters, so it is the only character escaped there.  Rules that
    # rely on the glob behavior would be affected by this.
    for cat in tree.iter('cat-item'):
        lemma = cat.get('lemma')
        if lemma is not None:
            cat.set('lemma', lemma.replace('*', '\\*'))

    # Literal values get every special character escaped.  Strictly this is
    # only needed for values compared against input, but escaping output
    # values as well is harmless and also covers multi-stage transfer.
    # A translation table escapes each character in a single pass, so a
    # backslash inserted for one character is never escaped a second time.
    escapeTable = str.maketrans({char: '\\' + char for char in '\\*^$/<>{}'})
    for tag in ('lit', 'list-item'):
        for node in tree.iter(tag):
            value = node.get('v')
            if value is not None:
                node.set('v', value.translate(escapeTable))

    # Serialize before opening the output so a failure here cannot leave a
    # truncated file behind.  Transfer rule data is always written decomposed.
    text = unicodedata.normalize('NFD', ET.tostring(tree, encoding='unicode'))

    # Create the new file (e.g. tr.t1x) to be used by Apertium.
    outPath = os.path.join(buildFolder, strippedRulesFileName)
    with open(outPath, 'w', encoding='utf-8') as fout:
        fout.write(text)

    return False