improve: Escape square brackets less (#464)

hukkin · Oct 26, 2024 · 1046b03 · 1046b03
1 parent 0b8fa44
commit 1046b03
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 2 deletions.
diff --git a/docs/users/changelog.md b/docs/users/changelog.md
@@ -11,6 +11,8 @@ Note that there is currently no guarantee for a stable Markdown formatting style
 - Added
   - Plugin interface: `mdformat.plugins.ParserExtensionInterface.add_cli_argument_group`.
     With this plugins can now read CLI arguments merged with values from `.mdformat.toml`.
+- Changed
+  - Style: No longer escape square bracket enclosures.
 - Improved
   - Plugin interface: A trailing newline is added to fenced code blocks if a plugin fails to add it.
 

diff --git a/src/mdformat/renderer/_context.py b/src/mdformat/renderer/_context.py
@@ -18,6 +18,7 @@
     decimalify_leading,
     decimalify_trailing,
     escape_asterisk_emphasis,
+    escape_square_brackets,
     escape_underscore_emphasis,
     get_list_marker_type,
     is_tight_list,
@@ -115,8 +116,8 @@ def text(node: RenderTreeNode, context: RenderContext) -> str:
 
     text = escape_asterisk_emphasis(text)  # Escape emphasis/strong marker.
     text = escape_underscore_emphasis(text)  # Escape emphasis/strong marker.
-    text = text.replace("[", "\\[")  # Escape link label enclosure
-    text = text.replace("]", "\\]")  # Escape link label enclosure
+    # Escape link label and link ref enclosures
+    text = escape_square_brackets(text, context.env["used_refs"])
     text = text.replace("<", "\\<")  # Escape URI enclosure
     text = text.replace("`", "\\`")  # Escape code span marker
 

diff --git a/src/mdformat/renderer/_util.py b/src/mdformat/renderer/_util.py
@@ -186,3 +186,71 @@ def decimalify_trailing(char_set: Iterable[str], text: str) -> str:
     if last_char in char_set:
         return f"{text[:-1]}&#{ord(last_char)};"
     return text
+
+
+def split_at_indexes(text: str, indexes: Iterable[int]) -> list[str]:
+    """Return the text in parts.
+
+    Make splits right before the indexed character. Raise ValueError if
+    `indexes` is empty, also when it is an empty iterator or generator.
+    """
+    # Sort first: a lazy iterable is always truthy, so testing `indexes`
+    # itself would let an empty one through and crash further down.
+    cuts = sorted(indexes)
+    if not cuts:
+        raise ValueError("indexes must not be empty")
+    starts = [0, *cuts]
+    ends = [*cuts, None]  # None: the last part runs to the end of the text
+    return [text[start:end] for start, end in zip(starts, ends)]
+
+
+def escape_square_brackets(text: str, used_refs: Iterable[str]) -> str:
+    """Return the input string with square brackets ("[" and "]") escaped in a
+    safe way that avoids unintended link labels or refs after formatting.
+
+    Heuristic to use:
+    Escape all square brackets unless all the following are true for
+    a closed pair of brackets ([ + text + ]):
+    - the brackets enclose text containing no square brackets
+    - the text is not a used_ref (a link label used in a valid link or image)
+    - the enclosure is not followed by ":" or "(" (I believe that this, rather
+      than requiring the enclosure to be followed by a character other than
+      ":" or "(", should be sufficient, as no inline other than 'text' can
+      start with ":" or "(", and a following text inline never exists as it
+      would be included in the same token).
+    """
+    escape_before_pos = []  # indexes of brackets that get a "\" in front
+    pos = 0  # index where the next bracket search starts
+    enclosure_start: int | None = None  # index of a pending, unclosed "["
+    while True:
+        bracket_match = RE_SQUARE_BRACKET.search(text, pos)
+        if not bracket_match:  # pragma: >=3.10 cover
+            if enclosure_start is not None:  # a pending "[" never got closed
+                escape_before_pos.append(enclosure_start)
+            break
+
+        bracket = bracket_match.group()
+        bracket_pos = bracket_match.start()
+        pos = bracket_pos + 1  # resume right after this bracket
+        if bracket == "[":
+            if enclosure_start is not None:  # pending "[" superseded: escape it
+                escape_before_pos.append(enclosure_start)
+            enclosure_start = bracket_pos
+        else:
+            if enclosure_start is None:  # "]" with no pending "[": escape it
+                escape_before_pos.append(bracket_pos)
+            else:
+                enclosed = text[enclosure_start + 1 : bracket_pos]
+                next_char = text[bracket_pos + 1 : bracket_pos + 2]  # can be empty str
+                if enclosed.upper() not in used_refs and next_char not in {":", "("}:
+                    enclosure_start = None  # pair left unescaped
+                else:
+                    escape_before_pos.append(enclosure_start)
+                    escape_before_pos.append(bracket_pos)
+                    enclosure_start = None
+    if not escape_before_pos:
+        return text
+    return "\\".join(split_at_indexes(text, escape_before_pos))  # add the "\"s
+
+
+RE_SQUARE_BRACKET = re.compile(r"[\[\]]")  # matches one "[" or "]" character
diff --git a/tests/data/default_style.md b/tests/data/default_style.md
@@ -464,3 +464,34 @@ Unicode space (U+3000) after heading
 .
 # hoge
 .
+
+Square bracket escapes
+.
+[no-escape]no [no-escape] no [\[\]](/url)
+
+[escape]
+
+[inline\](/url)
+
+[link-label]
+
+[link-label\]
+
+[link-label\]: /url
+
+[link-label]: /url
+.
+[no-escape]no [no-escape] no [[]](/url)
+
+[escape]
+
+\[inline\](/url)
+
+[link-label]
+
+\[link-label\]
+
+\[link-label\]: /url
+
+[link-label]: /url
+.
diff --git a/tests/test_renderer_util.py b/tests/test_renderer_util.py
@@ -0,0 +1,26 @@
+import pytest
+
+from mdformat.renderer import _util
+
+
+@pytest.mark.parametrize(
+    "in_, indexes, out",
+    [
+        ("text", [0], ["", "text"]),  # split before the first char: empty head
+        ("text", [3], ["tex", "t"]),
+        ("text", [4], ["text", ""]),  # index equal to len(text): empty tail
+        (
+            "lorem dipsum iksum lopsum",
+            [0, 1, 6, 7],
+            ["", "l", "orem ", "d", "ipsum iksum lopsum"],
+        ),
+    ],
+)
+def test_split_at_indexes(in_, indexes, out):
+    assert _util.split_at_indexes(in_, indexes) == out  # exact parts, in order
+
+
+def test_split_at_indexes__valueerror():
+    with pytest.raises(ValueError) as exc_info:
+        _util.split_at_indexes("testtext", ())  # empty indexes must be rejected
+    assert "indexes must not be empty" in str(exc_info.value)