Skip to content

Commit

Permalink
Merge pull request #3009 from ruby/pass-unicode-escapes-to-onigmo
Browse files Browse the repository at this point in the history
Pass Unicode escapes on to onigmo
  • Loading branch information
kddnewton authored Aug 23, 2024
2 parents ca316d5 + fb98034 commit 49e061d
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 12 deletions.
1 change: 1 addition & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ errors:
- ESCAPE_INVALID_META_REPEAT
- ESCAPE_INVALID_UNICODE
- ESCAPE_INVALID_UNICODE_CM_FLAGS
- ESCAPE_INVALID_UNICODE_LIST
- ESCAPE_INVALID_UNICODE_LITERAL
- ESCAPE_INVALID_UNICODE_LONG
- ESCAPE_INVALID_UNICODE_SHORT
Expand Down
66 changes: 56 additions & 10 deletions src/prism.c
Original file line number Diff line number Diff line change
Expand Up @@ -9718,11 +9718,27 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
const uint8_t *start = parser->current.end - 1;
parser->current.end++;

if (peek(parser) == '{') {
if (parser->current.end == parser->end) {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
} else if (peek(parser) == '{') {
const uint8_t *unicode_codepoints_start = parser->current.end - 2;

parser->current.end++;
parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);

size_t whitespace;
while (true) {
if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
parser->current.end += whitespace;
} else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
// This is super hacky, but it gets us nicer error
// messages because we can still pass it off to the
// regular expression engine even if we hit an
// unterminated regular expression.
parser->current.end += 2;
} else {
break;
}
}

const uint8_t *extra_codepoints_start = NULL;
int codepoints_count = 0;
Expand All @@ -9736,8 +9752,17 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
} else if (hexadecimal_length == 0) {
// there are no hexadecimal characters
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);

if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to
// let the regular expression engine handle this
// error instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
}

return;
}

Expand All @@ -9759,10 +9784,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
}

if (peek(parser) == '}') {
if (parser->current.end == parser->end) {
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start);
} else if (peek(parser) == '}') {
parser->current.end++;
} else {
pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to let
// the regular expression engine handle this error
// instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
}
}

if (flags & PM_ESCAPE_FLAG_REGEXP) {
Expand All @@ -9772,8 +9806,12 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4));

if (length == 0) {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
}
} else if (length == 4) {
uint32_t value = escape_unicode(parser, parser->current.end, 4);

Expand All @@ -9785,7 +9823,15 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
parser->current.end += 4;
} else {
parser->current.end += length;
pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);

if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to let
// the regular expression engine handle this error
// instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
}
}
}

Expand Down
1 change: 1 addition & 0 deletions templates/src/diagnostic.c.erb
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_ESCAPE_INVALID_META_REPEAT] = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE] = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS] = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LIST] = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL] = { "invalid Unicode escape sequence; Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LONG] = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_SHORT] = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX },
Expand Down
3 changes: 2 additions & 1 deletion test/prism/errors/regexp_unicode_too_short.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/\u/
/\u
^~ too short escape sequence: \u
^ unterminated regexp meets end of file; expected a closing delimiter

Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
?\u{3
^~~~ unterminated Unicode escape
^~~~ invalid Unicode list: \u{3

5 changes: 5 additions & 0 deletions test/prism/unescape_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,11 @@ def assert_context(context)
# to validate backreferences so these are all going to fail.
next if (context.name == "//" || context.name.start_with?("%r")) && ord.chr.start_with?(/\d/)

# \u is passed directly on to the regular expression engine and it is
# responsible for handling syntax errors. In this case we do not check
# it because it would require going through the compiler.
next if context.is_a?(Context::RegExp) && ord.chr == "u"

# \a \b \c ...
assert_unescape(context, ord.chr)
end
Expand Down

0 comments on commit 49e061d

Please sign in to comment.