Do not modify contents of <note> tag (#86)

* Do not modify contents of <note> tag This changes the behaviour around normalizing the <note> field of .glif files. Previously the contents of this element would be broken into lines and reindented, but this was a potentially destructive operation. With this patch, any content of a <note> element is passed through unchanged. For more discussion, see: #85 * Address code review comments - make sure we're escaping XML correctly, and test that - remove dedent_tabs function - add a test for correctly encoding 'é'
unified-font-object · Aug 3, 2021 · 6d2e69c · 6d2e69c
1 parent 186dbc1
commit 6d2e69c
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 148 deletions.
diff --git a/src/ufonormalizer/__init__.py b/src/ufonormalizer/__init__.py
@@ -4,11 +4,9 @@
 import binascii
 import time
 import os
-import re
 import shutil
 from xml.etree import cElementTree as ET
 import plistlib
-import textwrap
 import datetime
 import glob
 from collections import OrderedDict
@@ -838,9 +836,7 @@ def _normalizeGlifNote(element, writer):
         return
     if not value.strip():
         return
-    writer.beginElement("note")
-    writer.text(value)
-    writer.endElement("note")
+    writer.simpleElement("note", value=xmlEscapeText(value))
 
 
 def _normalizeGlifOutlineFormat1(element, writer):
@@ -1261,29 +1257,6 @@ def data(self, text):
         line = "<![CDATA[%s]]>" % text
         self.raw(line)
 
-    def text(self, text):
-        text = text.strip("\n")
-        text = dedent_tabs(text)
-        text = text.strip()
-        text = xmlEscapeText(text)
-        paragraphs = []
-        for paragraph in text.splitlines():
-            if not paragraph:
-                paragraphs.append("")
-            else:
-                paragraph = textwrap.wrap(
-                    paragraph.rstrip(),
-                    width=xmlTextMaxLineLength,
-                    expand_tabs=False,
-                    replace_whitespace=False,
-                    drop_whitespace=False,
-                    break_long_words=False,
-                    break_on_hyphens=False
-                )
-                paragraphs.extend(paragraph)
-        for line in paragraphs:
-            self.raw(line)
-
     def simpleElement(self, tag, attrs=None, value=None):
         if attrs:
             attrs = self.attributesToString(attrs)
@@ -1450,63 +1423,6 @@ def xmlConvertInt(value):
     return str(value)
 
 
-# ---------------
-# Text Operations
-# ---------------
-
-WHITESPACE_ONLY_RE = re.compile(r'^[\s\t]+$', re.MULTILINE)
-LEADING_WHITESPACE_RE = re.compile(r'(^(?:\s{4}|\t)*)(?:[^\t\n])', re.MULTILINE)
-
-
-def dedent_tabs(text):
-    """
-    Based on `textwrap.dedent`, but modified to only work on tabs and 4-space indents
-
-    Remove any common leading tabs from every line in `text`.
-    This can be used to make triple-quoted strings line up with the left
-    edge of the display, while still presenting them in the source code
-    in indented form.
-
-    Entirely blank lines are normalized to a newline character.
-    """
-    # Look for the longest leading string of spaces and tabs common to
-    # all lines.
-    margin = None
-    text = WHITESPACE_ONLY_RE.sub('', text)
-    indents = LEADING_WHITESPACE_RE.findall(text)
-    for indent in indents:
-        if margin is None:
-            margin = indent
-
-        # Current line more deeply indented than previous winner:
-        # no change (previous winner is still on top).
-        elif indent.startswith(margin):
-            pass
-
-        # Current line consistent with and no deeper than previous winner:
-        # it's the new winner.
-        elif margin.startswith(indent):
-            margin = indent
-
-        # Find the largest common whitespace between current line and previous
-        # winner.
-        else:
-            for i, (x, y) in enumerate(zip(margin, indent)):
-                if x != y:
-                    margin = margin[:i]
-                    break
-
-    # sanity check (testing/debugging only)
-    if 0 and margin:
-        for line in text.split("\n"):
-            assert not line or line.startswith(margin), \
-                   "line = %r, margin = %r" % (line, margin)
-
-    if margin:
-        text = re.sub(r'(?m)^' + margin, '', text)
-    return text
-
-
 # ---------------
 # Path Operations
 # ---------------

diff --git a/tests/data/glif/format2.glif b/tests/data/glif/format2.glif
@@ -35,7 +35,5 @@
 			<string>1,0,0,0.5</string>
 		</dict>
 	</lib>
-	<note>
-		arbitrary text about the glyph
-	</note>
+	<note>arbitrary text about the glyph</note>
 </glyph>
diff --git a/tests/test_ufonormalizer.py b/tests/test_ufonormalizer.py
@@ -107,9 +107,7 @@
             <string>1,0,0,0.5</string>
         </dict>
     </lib>
-    <note>
-        arbitrary text about the glyph
-    </note>
+    <note>arbitrary text about the glyph</note>
 </glyph>
 '''
 
@@ -767,25 +765,35 @@ def test_normalizeGLIF_lib_undefined(self):
         self.assertEqual(writer.getText(), '')
 
     def test_normalizeGLIF_note_defined(self):
+        """ Serialization of notes is non-fancy: we take the note text and
+        use it, unchanged, as the body of the <note>element</note>. In previous
+        version of ufonormalizer we would break the user text into lines. See
+        https://github.com/unified-font-object/ufoNormalizer/issues/85 for some
+        background.
+        """
+
         element = ET.fromstring("<note>Blah</note>")
         writer = XMLWriter(declaration=None)
         _normalizeGlifNote(element, writer)
-        self.assertEqual(writer.getText(), "<note>\n\tBlah\n</note>")
+        self.assertEqual(writer.getText(), "<note>Blah</note>")
 
-        element = ET.fromstring("<note>   Blah  \t\n\t  </note>")
+        # encode accent correctly
+        element = ET.fromstring(
+             tobytes("<note>Don't forget to check the béziers!!</note>",
+                     encoding="utf8"))
         writer = XMLWriter(declaration=None)
         _normalizeGlifNote(element, writer)
-        self.assertEqual(writer.getText(), "<note>\n\tBlah\n</note>")
+        self.assertEqual(
+             writer.getText(),
+             "<note>Don't forget to check the b\xe9ziers!!</note>")
 
-        element = ET.fromstring(
-            tobytes("<note>Don't forget to check the béziers!!</note>",
-                    encoding="utf8"))
+        # trailing whitespace is preserved
+        element = ET.fromstring("<note>   Blah  \t\n\t  </note>")
         writer = XMLWriter(declaration=None)
         _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tDon't forget to check the b\xe9ziers!!\n</note>")
+        self.assertEqual(writer.getText(), "<note>   Blah  \t\n\t  </note>")
 
+        # multiline strings are preserved
         element = ET.fromstring(
             tobytes("<note>A quick brown fox jumps over the lazy dog.\n"
                     "Příliš žluťoučký kůň úpěl ďábelské ódy.</note>",
@@ -794,64 +802,24 @@ def test_normalizeGLIF_note_defined(self):
         _normalizeGlifNote(element, writer)
         self.assertEqual(
             writer.getText(),
-            "<note>\n\tA quick brown fox jumps over the lazy dog.\n\t"
+            "<note>A quick brown fox jumps over the lazy dog.\n"
             "P\u0159\xedli\u0161 \u017elu\u0165ou\u010dk\xfd k\u016f\u0148 "
-            "\xfap\u011bl \u010f\xe1belsk\xe9 \xf3dy.\n</note>")
-
-        element = ET.fromstring(
-            "<note>   Line1  \t\n\n    Line3\t  </note>")
-        writer = XMLWriter(declaration=None)
-        _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tLine1\n\t\n\t    Line3\n</note>")
-
-        # Normalizer should not indent Line2 and Line3 more than already indented
-        element = ET.fromstring(
-            "<note>\n\tLine1\n\tLine2\n\tLine3\n</note>")
-        writer = XMLWriter(declaration=None)
-        _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tLine1\n\tLine2\n\tLine3\n</note>")
-
-        # Normalizer should keep the extra tab in line 2
-        element = ET.fromstring(
-            "<note>\n\tLine1\n\t\tLine2\n\tLine3\n</note>")
-        writer = XMLWriter(declaration=None)
-        _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tLine1\n\t\tLine2\n\tLine3\n</note>")
+            "\xfap\u011bl \u010f\xe1belsk\xe9 \xf3dy.</note>")
 
-        # Normalizer should keep the extra spaces on line 2
+        # Everything is always preserved
         element = ET.fromstring(
-            "<note>\n\tLine1\n\t    Line2\n\tLine3\n</note>")
+            "<note>\n\tLine1\n\t\tLine2\n\t    Line3\n</note>")
         writer = XMLWriter(declaration=None)
         _normalizeGlifNote(element, writer)
         self.assertEqual(
             writer.getText(),
-            "<note>\n\tLine1\n\t    Line2\n\tLine3\n</note>")
+            "<note>\n\tLine1\n\t\tLine2\n\t    Line3\n</note>")
 
-        # Normalizer should remove the extra tab all lines have in common,
-        # but leave the additional tab on line 2
-        element = ET.fromstring(
-            "<note>\n\t\tLine1\n\t\t\tLine2\n\t\tLine3\n</note>")
+        # correctly escape xml
+        element = ET.fromstring("<note>escape&lt;br /&gt;me!</note>")
         writer = XMLWriter(declaration=None)
         _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tLine1\n\t\tLine2\n\tLine3\n</note>")
-
-        # Normalizer should remove the extra 4-space all lines have in common,
-        # but leave the additional 4-space on line 2
-        element = ET.fromstring(
-            "<note>\n        Line1\n            Line2\n        Line3\n</note>")
-        writer = XMLWriter(declaration=None)
-        _normalizeGlifNote(element, writer)
-        self.assertEqual(
-            writer.getText(),
-            "<note>\n\tLine1\n\t    Line2\n\tLine3\n</note>")
+        self.assertEqual(writer.getText(), "<note>escape&lt;br /&gt;me!</note>")
 
     def test_normalizeGLIF_note_undefined(self):
         element = ET.fromstring("<note></note>")