Skip to content

Commit

Permalink
Provide ability to lock encoding while parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Jun 10, 2024
1 parent fe34a77 commit f7faedf
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ The final argument to `pm_serialize_parse` is an optional string that controls t
| `1` | frozen string literal |
| `1` | command line flags |
| `1` | syntax version, see [pm_options_version_t](https://github.com/ruby/prism/blob/main/include/prism/options.h) for valid values |
| `1` | whether or not the encoding is locked (should almost always be false) |
| `4` | the number of scopes |
| ... | the scopes |

Expand Down
9 changes: 8 additions & 1 deletion ext/prism/extension.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,13 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
if (key_id == rb_id_option_filepath) {
if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value));
} else if (key_id == rb_id_option_encoding) {
if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
if (!NIL_P(value)) {
if (value == Qfalse) {
pm_options_encoding_locked_set(options, true);
} else {
pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
}
}
} else if (key_id == rb_id_option_line) {
if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value));
} else if (key_id == rb_id_option_frozen_string_literal) {
Expand Down Expand Up @@ -206,6 +212,7 @@ build_options(VALUE argument) {
static void
extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) {
options->line = 1; // default

if (!NIL_P(keywords)) {
struct build_options_data data = { .options = options, .keywords = keywords };
struct build_options_data *argument = &data;
Expand Down
15 changes: 15 additions & 0 deletions include/prism/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ typedef struct {
* - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
*/
int8_t frozen_string_literal;

/**
* Whether or not the encoding is locked. When this is true, encoding magic
* comments are ignored rather than respected. This is a niche use-case where
* you want to parse a file with a specific encoding but ignore any encoding
* magic comments at the top of the file.
*/
bool encoding_locked;
} pm_options_t;

/**
Expand Down Expand Up @@ -166,6 +173,14 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t
*/
PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding);

/**
* Set the encoding_locked option on the given options struct.
*
* The value is copied onto the parser when it is initialized. While it is
* true, the lexer skips its checks for encoding magic comments, so the
* encoding is not changed by a magic comment in the source.
*
* @param options The options struct to set the encoding_locked value on.
* @param encoding_locked The encoding_locked value to set. True locks the
*     encoding; false leaves encoding magic comments in effect.
*/
PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked);

/**
* Set the frozen string literal option on the given options struct.
*
Expand Down
8 changes: 8 additions & 0 deletions include/prism/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,14 @@ struct pm_parser {
/** Whether or not we're currently recovering from a syntax error. */
bool recovering;

/**
* This is very specialized behavior for when you want to parse in a context
* that does not respect encoding comments. Its main use case is translating
* into the whitequark/parser AST which re-encodes source files in UTF-8
* before they are parsed and ignores encoding comments.
*/
bool encoding_locked;

/**
* Whether or not the encoding has been changed by a magic comment. We use
* this to provide a fast path for the lexer instead of going through the
Expand Down
2 changes: 2 additions & 0 deletions java-wasm/src/test/java/org/prism/DummyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public void test1() {
false,
EnumSet.noneOf(ParsingOptions.CommandLine.class),
ParsingOptions.SyntaxVersion.LATEST,
false,
new byte[][][] {}
);

Expand Down Expand Up @@ -91,6 +92,7 @@ public void test2() {
false,
EnumSet.noneOf(ParsingOptions.CommandLine.class),
ParsingOptions.SyntaxVersion.LATEST,
false,
new byte[][][] {}
);

Expand Down
7 changes: 6 additions & 1 deletion java/org/prism/ParsingOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ public enum CommandLine { A, E, L, N, P, X };
* @param frozenStringLiteral whether the frozen string literal option has been set
* @param commandLine the set of flags that were set on the command line
* @param version code of Ruby version which syntax will be used to parse
* @param encodingLocked whether the encoding is locked (should almost always be false)
* @param scopes scopes surrounding the code that is being parsed with local variable names defined in every scope
* ordered from the outermost scope to the innermost one
*/
public static byte[] serialize(byte[] filepath, int line, byte[] encoding, boolean frozenStringLiteral, EnumSet<CommandLine> commandLine, SyntaxVersion version, byte[][][] scopes) {
public static byte[] serialize(byte[] filepath, int line, byte[] encoding, boolean frozenStringLiteral, EnumSet<CommandLine> commandLine, SyntaxVersion version, boolean encodingLocked, byte[][][] scopes) {
final ByteArrayOutputStream output = new ByteArrayOutputStream();

// filepath
Expand All @@ -69,10 +70,14 @@ public static byte[] serialize(byte[] filepath, int line, byte[] encoding, boole
// version
output.write(version.getValue());

// encodingLocked
output.write(encodingLocked ? 1 : 0);

// scopes

// number of scopes
write(output, serializeInt(scopes.length));

// local variables in each scope
for (byte[][] scope : scopes) {
// number of locals
Expand Down
3 changes: 3 additions & 0 deletions javascript/src/parsePrism.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ function dumpOptions(options) {
throw new Error(`Unsupported version '${options.version}' in compiler options`);
}

template.push("C");
values.push(options.encoding === false ? 1 : 0);

template.push("L");
if (options.scopes) {
const scopes = options.scopes;
Expand Down
3 changes: 3 additions & 0 deletions lib/prism/ffi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,9 @@ def dump_options(options)
template << "C"
values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])

template << "C"
values << (options[:encoding] == false ? 1 : 0)

template << "L"
if (scopes = options[:scopes])
values << scopes.length
Expand Down
6 changes: 3 additions & 3 deletions lib/prism/translation/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def parse(source_buffer)
source = source_buffer.source

offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)

build_ast(result.value, offset_cache)
ensure
Expand All @@ -64,7 +64,7 @@ def parse_with_comments(source_buffer)
source = source_buffer.source

offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)

[
build_ast(result.value, offset_cache),
Expand All @@ -83,7 +83,7 @@ def tokenize(source_buffer, recover = false)
offset_cache = build_offset_cache(source)
result =
begin
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
rescue ::Parser::SyntaxError
raise if !recover
end
Expand Down
9 changes: 9 additions & 0 deletions src/options.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) {
pm_string_constant_init(&options->encoding, encoding, strlen(encoding));
}

/**
* Record on the given options struct whether the encoding should be locked.
* The flag is stored as-is; no other field of the struct is touched.
*/
PRISM_EXPORTED_FUNCTION void
pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) {
    const bool locked = encoding_locked;
    (*options).encoding_locked = locked;
}

/**
* Set the line option on the given options struct.
*/
Expand Down Expand Up @@ -215,6 +223,7 @@ pm_options_read(pm_options_t *options, const char *data) {
options->frozen_string_literal = (int8_t) *data++;
options->command_line = (uint8_t) *data++;
options->version = (pm_options_version_t) *data++;
options->encoding_locked = ((uint8_t) *data++) > 0;

uint32_t scopes_count = pm_options_read_u32(data);
data += 4;
Expand Down
10 changes: 8 additions & 2 deletions src/prism.c
Original file line number Diff line number Diff line change
Expand Up @@ -8261,7 +8261,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {

// We only want to attempt to compare against encoding comments if it's
// the first line in the file (or the second in the case of a shebang).
if (parser->current.start == parser->encoding_comment_start) {
if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) {
if (
(key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
(key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
Expand Down Expand Up @@ -10438,7 +10438,9 @@ parser_lex(pm_parser_t *parser) {
// pass and we're at the start of the file, then we need
// to do another pass to potentially find other patterns
// for encoding comments.
if (length >= 10) parser_lex_magic_comment_encoding(parser);
if (length >= 10 && !parser->encoding_locked) {
parser_lex_magic_comment_encoding(parser);
}
}

lexed_comment = true;
Expand Down Expand Up @@ -21244,6 +21246,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.parsing_eval = false,
.command_start = true,
.recovering = false,
.encoding_locked = false,
.encoding_changed = false,
.pattern_matching_newlines = false,
.in_keyword_arg = false,
Expand Down Expand Up @@ -21291,6 +21294,9 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length);
}

// encoding_locked option
parser->encoding_locked = options->encoding_locked;

// frozen_string_literal option
parser->frozen_string_literal = options->frozen_string_literal;

Expand Down
1 change: 0 additions & 1 deletion test/prism/ruby/parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ class ParserTest < TestCase
# skip them for now.
skip_all = skip_incorrect | [
"regex.txt",
"regex_char_width.txt",
"unescaping.txt",
"seattlerb/bug190.txt",
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",
Expand Down

0 comments on commit f7faedf

Please sign in to comment.