Skip to content

Commit

Permalink
improve: Escape square brackets less (#464)
Browse files Browse the repository at this point in the history
  • Loading branch information
hukkin authored Oct 26, 2024
1 parent 0b8fa44 commit 1046b03
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/users/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Note that there is currently no guarantee for a stable Markdown formatting style
- Added
- Plugin interface: `mdformat.plugins.ParserExtensionInterface.add_cli_argument_group`.
With this plugins can now read CLI arguments merged with values from `.mdformat.toml`.
- Changed
- Style: No longer escape square bracket enclosures.
- Improved
- Plugin interface: A trailing newline is added to fenced code blocks if a plugin fails to add it.

Expand Down
5 changes: 3 additions & 2 deletions src/mdformat/renderer/_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
decimalify_leading,
decimalify_trailing,
escape_asterisk_emphasis,
escape_square_brackets,
escape_underscore_emphasis,
get_list_marker_type,
is_tight_list,
Expand Down Expand Up @@ -115,8 +116,8 @@ def text(node: RenderTreeNode, context: RenderContext) -> str:

text = escape_asterisk_emphasis(text) # Escape emphasis/strong marker.
text = escape_underscore_emphasis(text) # Escape emphasis/strong marker.
text = text.replace("[", "\\[") # Escape link label enclosure
text = text.replace("]", "\\]") # Escape link label enclosure
# Escape link label and link ref enclosures
text = escape_square_brackets(text, context.env["used_refs"])
text = text.replace("<", "\\<") # Escape URI enclosure
text = text.replace("`", "\\`") # Escape code span marker

Expand Down
68 changes: 68 additions & 0 deletions src/mdformat/renderer/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,71 @@ def decimalify_trailing(char_set: Iterable[str], text: str) -> str:
if last_char in char_set:
return f"{text[:-1]}&#{ord(last_char)};"
return text


def split_at_indexes(text: str, indexes: Iterable[int]) -> list[str]:
    """Return the text in parts.

    Make splits right before the indexed character. The indexes may be
    given in any order; they are sorted before splitting, so the result
    always contains ``len(indexes) + 1`` parts in text order.

    Raises:
        ValueError: if ``indexes`` contains no items.
    """
    # Materialize before the emptiness check: a lazy iterable (generator,
    # iterator) is always truthy, so testing `indexes` directly would let
    # an empty one slip past the guard.
    cut_points = sorted(indexes)
    if not cut_points:
        raise ValueError("indexes must not be empty")
    parts = []
    prev_i = 0
    for i in cut_points:
        parts.append(text[prev_i:i])
        prev_i = i
    # Everything from the last split point to the end of the text.
    parts.append(text[prev_i:])
    return parts


def escape_square_brackets(text: str, used_refs: Iterable[str]) -> str:
    """Backslash-escape square brackets that could change meaning.

    The goal is to avoid unintended link labels or link refs after
    formatting while leaving harmless brackets untouched.

    Every "[" and "]" is escaped, except for a closed pair
    ([ + text + ]) that satisfies all of the following:
      - the enclosed text contains no square brackets
      - the enclosed text, upper-cased, is not in `used_refs` (link
        labels used in a valid link or image)
      - the closing bracket is not immediately followed by ":" or "("

    Checking only the single following character is believed to be
    sufficient: no inline other than 'text' can start with ":" or "(",
    and a following text inline never exists, as it would be included
    in the same token.
    """
    escape_positions = []
    # Index of the most recent "[" that is still waiting for its "]".
    open_at: int | None = None
    for match in RE_SQUARE_BRACKET.finditer(text):
        idx = match.start()

        if match.group() == "[":
            # A pending opener can no longer form a bracket-free pair,
            # so it gets escaped and this bracket becomes the candidate.
            if open_at is not None:
                escape_positions.append(open_at)
            open_at = idx
            continue

        # From here on the bracket is a "]".
        if open_at is None:
            # Closing bracket without an opener.
            escape_positions.append(idx)
            continue

        label = text[open_at + 1 : idx]
        follower = text[idx + 1 : idx + 2]  # empty str at end of text
        if label.upper() in used_refs or follower in {":", "("}:
            # The pair would read as a link ref or link syntax.
            escape_positions.append(open_at)
            escape_positions.append(idx)
        open_at = None

    # An opener that never found its closing bracket.
    if open_at is not None:
        escape_positions.append(open_at)

    if not escape_positions:
        return text
    # Joining the pieces with a backslash puts one in front of every
    # bracket selected for escaping.
    return "\\".join(split_at_indexes(text, escape_positions))


# Matches a single square bracket character, either "[" or "]".
RE_SQUARE_BRACKET = re.compile(r"[\[\]]")
31 changes: 31 additions & 0 deletions tests/data/default_style.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,3 +464,34 @@ Unicode space (U+3000) after heading
.
# hoge
.

Square bracket escapes
.
[no-escape]no [no-escape] no [\[\]](/url)

[escape]

[inline\](/url)

[link-label]

[link-label\]

[link-label\]: /url

[link-label]: /url
.
[no-escape]no [no-escape] no [[]](/url)

[escape]

\[inline\](/url)

[link-label]

\[link-label\]

\[link-label\]: /url

[link-label]: /url
.
26 changes: 26 additions & 0 deletions tests/test_renderer_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from mdformat.renderer import _util


@pytest.mark.parametrize(
    "in_, indexes, out",
    [
        ("text", [0], ["", "text"]),
        ("text", [3], ["tex", "t"]),
        ("text", [4], ["text", ""]),
        (
            "lorem dipsum iksum lopsum",
            [0, 1, 6, 7],
            ["", "l", "orem ", "d", "ipsum iksum lopsum"],
        ),
    ],
)
def test_split_at_indexes(in_, indexes, out):
    """Check that the text is split right before each given index."""
    parts = _util.split_at_indexes(in_, indexes)
    assert parts == out


def test_split_at_indexes__valueerror():
    """Check that an empty collection of indexes is rejected."""
    # `match` searches the stringified exception for the message.
    with pytest.raises(ValueError, match="indexes must not be empty"):
        _util.split_at_indexes("testtext", ())

0 comments on commit 1046b03

Please sign in to comment.