Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve: Escape square brackets less #464

Merged
merged 5 commits into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/users/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Note that there is currently no guarantee for a stable Markdown formatting style
- Added
- Plugin interface: `mdformat.plugins.ParserExtensionInterface.add_cli_argument_group`.
With this, plugins can now read CLI arguments merged with values from `.mdformat.toml`.
- Changed
- Style: No longer escape square bracket enclosures.
- Improved
- Plugin interface: A trailing newline is added to fenced code blocks if a plugin fails to add it.

Expand Down
5 changes: 3 additions & 2 deletions src/mdformat/renderer/_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
decimalify_leading,
decimalify_trailing,
escape_asterisk_emphasis,
escape_square_brackets,
escape_underscore_emphasis,
get_list_marker_type,
is_tight_list,
Expand Down Expand Up @@ -115,8 +116,8 @@ def text(node: RenderTreeNode, context: RenderContext) -> str:

text = escape_asterisk_emphasis(text) # Escape emphasis/strong marker.
text = escape_underscore_emphasis(text) # Escape emphasis/strong marker.
text = text.replace("[", "\\[") # Escape link label enclosure
text = text.replace("]", "\\]") # Escape link label enclosure
# Escape link label and link ref enclosures
text = escape_square_brackets(text, context.env["used_refs"])
text = text.replace("<", "\\<") # Escape URI enclosure
text = text.replace("`", "\\`") # Escape code span marker

Expand Down
68 changes: 68 additions & 0 deletions src/mdformat/renderer/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,71 @@ def decimalify_trailing(char_set: Iterable[str], text: str) -> str:
if last_char in char_set:
return f"{text[:-1]}&#{ord(last_char)};"
return text


def split_at_indexes(text: str, indexes: Iterable[int]) -> list[str]:
    """Return the text in parts.

    Make splits right before the indexed character. The indexes are
    sorted before splitting, so they may be given in any order.

    ``indexes`` may be any iterable, including a one-shot iterator or
    generator; it is consumed exactly once.

    Raises:
        ValueError: If ``indexes`` yields no items.
    """
    # Materialize before the emptiness check: an iterator or generator is
    # always truthy, so `not indexes` cannot detect an empty one.
    split_points = sorted(indexes)
    if not split_points:
        raise ValueError("indexes must not be empty")
    parts = []
    prev_i = 0
    for i in split_points:
        parts.append(text[prev_i:i])
        prev_i = i
    # The tail starts at the last split point.
    parts.append(text[prev_i:])
    return parts


def escape_square_brackets(text: str, used_refs: Iterable[str]) -> str:
    """Backslash-escape square brackets that could form a link label or ref.

    Every "[" and "]" is escaped, except for a closed pair
    ("[" + enclosed text + "]") for which all of the following hold:
      - the enclosed text contains no square brackets
      - the upper-cased enclosed text is not in `used_refs`
      - the closing bracket is not immediately followed by ":" or "("

    Such a pair is left as is. Unmatched brackets, and an opening bracket
    that is superseded by a later "[" before being closed, are always
    escaped.
    """
    positions_to_escape = []
    # Index of the most recent "[" that has not been closed or escaped yet.
    open_pos: int | None = None

    for match in RE_SQUARE_BRACKET.finditer(text):
        idx = match.start()

        if match.group() == "[":
            # A new opener invalidates the pending one: the pending
            # enclosure would contain a bracket, so escape its opener.
            if open_pos is not None:
                positions_to_escape.append(open_pos)
            open_pos = idx
            continue

        # From here on the bracket is a "]".
        if open_pos is None:
            # Closing bracket with no pending opener.
            positions_to_escape.append(idx)
            continue

        label = text[open_pos + 1 : idx]
        follower = text[idx + 1 : idx + 2]  # empty str at end of text
        if label.upper() in used_refs or follower in {":", "("}:
            positions_to_escape.append(open_pos)
            positions_to_escape.append(idx)
        open_pos = None

    # An opener still pending at the end of the text was never closed.
    if open_pos is not None:
        positions_to_escape.append(open_pos)

    if not positions_to_escape:
        return text
    # Joining the parts with a backslash puts one before each position.
    return "\\".join(split_at_indexes(text, positions_to_escape))


# Matches a single opening or closing square bracket ("[" or "]").
RE_SQUARE_BRACKET = re.compile(r"[\[\]]")
31 changes: 31 additions & 0 deletions tests/data/default_style.md
Original file line number Diff line number Diff line change
Expand Up @@ -464,3 +464,34 @@ Unicode space (U+3000) after heading
.
# hoge
.

Square bracket escapes
.
[no-escape]no [no-escape] no [\[\]](/url)

[escape]

[inline\](/url)

[link-label]

[link-label\]

[link-label\]: /url

[link-label]: /url
.
[no-escape]no [no-escape] no [[]](/url)

[escape]

\[inline\](/url)

[link-label]

\[link-label\]

\[link-label\]: /url

[link-label]: /url
.
26 changes: 26 additions & 0 deletions tests/test_renderer_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from mdformat.renderer import _util


@pytest.mark.parametrize(
    ("in_", "indexes", "out"),
    [
        ("text", [0], ["", "text"]),
        ("text", [3], ["tex", "t"]),
        ("text", [4], ["text", ""]),
        (
            "lorem dipsum iksum lopsum",
            [0, 1, 6, 7],
            ["", "l", "orem ", "d", "ipsum iksum lopsum"],
        ),
    ],
)
def test_split_at_indexes(in_, indexes, out):
    """The text is split right before each indexed character."""
    parts = _util.split_at_indexes(in_, indexes)
    assert parts == out


def test_split_at_indexes__valueerror():
    """An empty collection of indexes is rejected with a ValueError."""
    with pytest.raises(ValueError, match="indexes must not be empty"):
        _util.split_at_indexes("testtext", ())