Merge pull request #3009 from ruby/pass-unicode-escapes-to-onigmo

Pass Unicode escapes on to onigmo
ruby · Aug 23, 2024 · 49e061d · 49e061d
2 parents ca316d5 + fb98034
commit 49e061d
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 12 deletions.
diff --git a/config.yml b/config.yml
@@ -81,6 +81,7 @@ errors:
   - ESCAPE_INVALID_META_REPEAT
   - ESCAPE_INVALID_UNICODE
   - ESCAPE_INVALID_UNICODE_CM_FLAGS
+  - ESCAPE_INVALID_UNICODE_LIST
   - ESCAPE_INVALID_UNICODE_LITERAL
   - ESCAPE_INVALID_UNICODE_LONG
   - ESCAPE_INVALID_UNICODE_SHORT

diff --git a/src/prism.c b/src/prism.c
@@ -9718,11 +9718,27 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
             const uint8_t *start = parser->current.end - 1;
             parser->current.end++;
 
-            if (peek(parser) == '{') {
+            if (parser->current.end == parser->end) {
+                const uint8_t *start = parser->current.end - 2;
+                PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+            } else if (peek(parser) == '{') {
                 const uint8_t *unicode_codepoints_start = parser->current.end - 2;
-
                 parser->current.end++;
-                parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
+
+                size_t whitespace;
+                while (true) {
+                    if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
+                        parser->current.end += whitespace;
+                    } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
+                        // This is super hacky, but it gets us nicer error
+                        // messages because we can still pass it off to the
+                        // regular expression engine even if we hit an
+                        // unterminated regular expression.
+                        parser->current.end += 2;
+                    } else {
+                        break;
+                    }
+                }
 
                 const uint8_t *extra_codepoints_start = NULL;
                 int codepoints_count = 0;
@@ -9736,8 +9752,17 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                         pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
                     } else if (hexadecimal_length == 0) {
                         // there are not hexadecimal characters
-                        pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
-                        pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+
+                        if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                            // If this is a regular expression, we are going to
+                            // let the regular expression engine handle this
+                            // error instead of us.
+                            pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                        } else {
+                            pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
+                            pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                        }
+
                         return;
                     }
 
@@ -9759,10 +9784,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                     pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
                 }
 
-                if (peek(parser) == '}') {
+                if (parser->current.end == parser->end) {
+                    PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start);
+                } else if (peek(parser) == '}') {
                     parser->current.end++;
                 } else {
-                    pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        // If this is a regular expression, we are going to let
+                        // the regular expression engine handle this error
+                        // instead of us.
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                    }
                 }
 
                 if (flags & PM_ESCAPE_FLAG_REGEXP) {
@@ -9772,8 +9806,12 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                 size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4));
 
                 if (length == 0) {
-                    const uint8_t *start = parser->current.end - 2;
-                    PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        const uint8_t *start = parser->current.end - 2;
+                        PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+                    }
                 } else if (length == 4) {
                     uint32_t value = escape_unicode(parser, parser->current.end, 4);
 
@@ -9785,7 +9823,15 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                     parser->current.end += 4;
                 } else {
                     parser->current.end += length;
-                    pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
+
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        // If this is a regular expression, we are going to let
+                        // the regular expression engine handle this error
+                        // instead of us.
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
+                    }
                 }
             }
 

diff --git a/templates/src/diagnostic.c.erb b/templates/src/diagnostic.c.erb
@@ -165,6 +165,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_ESCAPE_INVALID_META_REPEAT]         = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_ESCAPE_INVALID_UNICODE]             = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS]    = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_ESCAPE_INVALID_UNICODE_LIST]        = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL]     = { "invalid Unicode escape sequence; Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_ESCAPE_INVALID_UNICODE_LONG]        = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_ESCAPE_INVALID_UNICODE_SHORT]       = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX },

diff --git a/test/prism/errors/regexp_unicode_too_short.txt b/test/prism/errors/regexp_unicode_too_short.txt
@@ -1,3 +1,4 @@
-/\u/
+/\u
  ^~ too short escape sequence: \u
+^ unterminated regexp meets end of file; expected a closing delimiter
 
diff --git a/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt b/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt
@@ -1,3 +1,3 @@
 ?\u{3
- ^~~~ unterminated Unicode escape
+ ^~~~ invalid Unicode list: \u{3
 
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
@@ -159,6 +159,11 @@ def assert_context(context)
         # to validate backreferences so these are all going to fail.
         next if (context.name == "//" || context.name.start_with?("%r")) && ord.chr.start_with?(/\d/)
 
+        # \u is passed directly on to the regular expression engine and it is
+        # responsible for handling syntax errors. In this case we do not check
+        # it because it would require going through the compiler.
+        next if context.is_a?(Context::RegExp) && ord.chr == "u"
+
         # \a \b \c ...
         assert_unescape(context, ord.chr)
       end