From 2a43b4f55cfb490d5b052630590b27893ef7f30f Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 16 May 2024 12:29:14 -0400 Subject: [PATCH] More mixed encoding errors --- src/prism.c | 8 ++++++++ src/util/pm_strpbrk.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/prism.c b/src/prism.c index 6befa6bab14..be6b52f5b1b 100644 --- a/src/prism.c +++ b/src/prism.c @@ -15382,6 +15382,10 @@ parse_string_part(pm_parser_t *parser) { // "aaa #{bbb} #@ccc ddd" // ^^^^^^ case PM_TOKEN_EMBEXPR_BEGIN: { + // Ruby disallows seeing encoding around interpolation in strings, + // even though it is known at parse time. + parser->explicit_encoding = NULL; + pm_lex_state_t state = parser->lex_state; int brace_nesting = parser->brace_nesting; @@ -15414,6 +15418,10 @@ parse_string_part(pm_parser_t *parser) { // "aaa #{bbb} #@ccc ddd" // ^^^^^ case PM_TOKEN_EMBVAR: { + // Ruby disallows seeing encoding around interpolation in strings, + // even though it is known at parse time. + parser->explicit_encoding = NULL; + lex_state_set(parser, PM_LEX_STATE_BEG); parser_lex(parser); diff --git a/src/util/pm_strpbrk.c b/src/util/pm_strpbrk.c index 6c8dea18367..916a4cc3fd3 100644 --- a/src/util/pm_strpbrk.c +++ b/src/util/pm_strpbrk.c @@ -8,6 +8,27 @@ pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start); } +/** + * Set the explicit encoding for the parser to the current encoding. + */ +static inline void +pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) { + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == parser->encoding) { + // Okay, we already locked to this encoding. + } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // Not okay, we already found a Unicode escape sequence and this + // conflicts. + pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } else { + // Should not be anything else. + assert(false && "unreachable"); + } + } + + parser->explicit_encoding = parser->encoding; +} + /** * This is the default path. */ @@ -52,7 +73,7 @@ pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *chars * This is the path when the encoding is ASCII-8BIT. */ static inline const uint8_t * -pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) { +pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { size_t index = 0; while (index < maximum) { @@ -60,6 +81,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi return source + index; } + if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1); index++; } @@ -72,6 +94,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi static inline const uint8_t * pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; while (index < maximum) { if (strchr((const char *) charset, source[index]) != NULL) { @@ -81,7 +104,8 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t if (source[index] < 0x80) { index++; } else { - size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width); if (width > 0) { index += width; @@ -96,7 +120,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t do { index++; - } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); } @@ -113,6 +137,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t static inline const uint8_t * pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; while (index < maximum) { if (strchr((const char *) charset, source[index]) != NULL) { @@ -122,7 +147,8 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t if (source[index] < 0x80 || !validate) { index++; } else { - size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + pm_strpbrk_explicit_encoding_set(parser, source, width); if (width > 0) { index += width; @@ -135,7 +161,7 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t do { index++; - } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); } @@ -171,7 +197,7 @@ pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, p } else if (!parser->encoding_changed) { return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate); } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { - return pm_strpbrk_ascii_8bit(source, charset, (size_t) length); + return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate); } else if (parser->encoding->multibyte) { return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate); } else {