Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MacroString parser fixes (number, SeString-to-SeString, invalid sequence) #101

Merged
merged 1 commit into from
Nov 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions src/Lumina.Tests/SeStringBuilderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,13 @@ public void ComplicatedTest()
}

[Sheet( "Addon" )]
public readonly struct Addon( ExcelPage page, uint offset, uint row ) : IExcelRow<Addon>
public readonly struct Addon( ExcelPage page, uint offset, uint row ) : IExcelRow< Addon >
{
public uint RowId => row;

public ReadOnlySeString Text => page.ReadString( offset, offset );

static Addon IExcelRow<Addon>.Create( ExcelPage page, uint offset, uint row ) =>
static Addon IExcelRow< Addon >.Create( ExcelPage page, uint offset, uint row ) =>
new( page, offset, row );
}

Expand Down Expand Up @@ -458,7 +458,7 @@ public unsafe void SpanViewNullTerminationTest()
var span = test.GetViewAsSpan();
Assert.True( span.SequenceEqual( expected ) );
fixed( byte* p = span )
Assert.Equal( 0 , p[ span.Length ]);
Assert.Equal( 0, p[ span.Length ] );
}

[Fact]
Expand Down Expand Up @@ -605,14 +605,60 @@ public void FriendlyErrorMessage()
}
}

[Fact]
public void ParseNestedSeStringPayloadTest()
{
    // First pass: the backslash keeps "<italic(1)>" as plain text, while "<italic(0)>" is parsed as a macro.
    var firstPass = ReadOnlySeString.FromMacroString( "ABC\\<italic(1)>DEF<italic(0)>" );

    // Second pass: feed the result of the first pass back into the parser, with SeString payload enumeration enabled.
    var secondPass = ReadOnlySeString.FromMacroString(
        firstPass,
        new() { CharEnumerationFlags = UtfEnumeratorFlags.Utf8SeString } );

    // The outcome must match parsing the fully unescaped macro string in one go.
    var expected = ReadOnlySeString.FromMacroString( "ABC<italic(1)>DEF<italic(0)>" );
    Assert.Equal( expected, secondPass );
}

[Fact]
public void ParseIgnoreInvalidUtf8SequenceTest()
{
    // "AB", one byte that can never appear in UTF-8, then "CD".
    var invalidSequence = new byte[]
    {
        (byte) 'A',
        (byte) 'B',
        0xFF, // 0xFF is never valid in UTF-8 bytes
        (byte) 'C',
        (byte) 'D',
    };

    // With ThrowOnFirstError, the malformed byte must surface as an exception.
    Assert.Throws< EncoderFallbackException >( () =>
        ReadOnlySeString.FromMacroString( invalidSequence, new() { CharEnumerationFlags = UtfEnumeratorFlags.ThrowOnFirstError } ) );

    // With IgnoreErrors, the malformed byte must be dropped and the surrounding text kept.
    // xUnit's Assert.Equal takes (expected, actual); keeping that order makes failure messages label the values correctly.
    Assert.Equal(
        "ABCD"u8.ToArray(),
        ReadOnlySeString.FromMacroString( invalidSequence, new() { CharEnumerationFlags = UtfEnumeratorFlags.IgnoreErrors } ).Data.ToArray() );
}

[Fact]
public void ParseNumberTest()
{
    // Parses "<italic(NUMBER)>" and checks that the sole expression of the resulting payload is the expected unsigned integer.
    static void Test( string numberString, uint expected )
    {
        var e = ReadOnlySeString.FromMacroString( $"<italic({numberString})>" ).AsSpan().GetEnumerator();
        Assert.True( e.MoveNext() );
        Assert.Equal( MacroCode.Italic, e.Current.MacroCode );
        Assert.True( e.Current.TryGetExpression( out var expr ) );
        Assert.True( expr.TryGetUInt( out var parsed ) );

        // xUnit's Assert.Equal takes (expected, actual); keeping that order makes failure messages label the values correctly.
        Assert.Equal( expected, parsed );
    }

    // Leading zero followed by a digit separator or another decimal digit: still decimal.
    Test( "0_00'0012'345", 12345 );

    // Explicit radix prefixes, with '_' and '\'' accepted as digit separators.
    Test( "0o000_5151", 2665 );
    Test( "0b0000'1111'0000'1111", 0x0F0F );
    Test( "0x1234_5678", 0x12345678 );
    Test( "0d_5555", 5555 );
}

[RequiresGameInstallationFact]
public void AllSheetsTextColumnCodec()
{
var gameData = RequiresGameInstallationFact.CreateGameData();
var ssb = new SeStringBuilder();
foreach( var sheetName in gameData.Excel.SheetNames )
{
var header = gameData.GetFile<ExcelHeaderFile>( $"exd/{sheetName}.exh" );
var header = gameData.GetFile< ExcelHeaderFile >( $"exd/{sheetName}.exh" );
if( header?.Header.Variant == ExcelVariant.Subrows )
continue;
var languages = header?.Languages ?? [Language.None];
Expand All @@ -625,7 +671,7 @@ public void AllSheetsTextColumnCodec()
{
foreach( var columnOffset in stringColumns )
{
var test1 = row.ReadString(columnOffset);
var test1 = row.ReadString( columnOffset );
if( test1.Data.Span.IndexOf( "payload:"u8 ) != -1 )
throw new( $"Unsupported payload at {sheetName}#{row.RowId}; {test1}" );

Expand All @@ -647,14 +693,14 @@ public void AllSheetsTextColumnCodec()
}

[Sheet]
public readonly struct RawRow( ExcelPage page, uint offset, uint row ) : IExcelRow<RawRow>
public readonly struct RawRow( ExcelPage page, uint offset, uint row ) : IExcelRow< RawRow >
{
public uint RowId => row;

public ReadOnlySeString ReadString( ushort off ) =>
page.ReadString( off + offset, offset );

static RawRow IExcelRow<RawRow>.Create( ExcelPage page, uint offset, uint row ) =>
static RawRow IExcelRow< RawRow >.Create( ExcelPage page, uint offset, uint row ) =>
new( page, offset, row );
}
}
78 changes: 55 additions & 23 deletions src/Lumina/Text/Parse/MacroStringParser.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
using Lumina.Text.Expressions;
Expand All @@ -9,9 +10,11 @@ namespace Lumina.Text.Parse;

internal readonly ref struct MacroStringParser
{
// Map from ascii code to supposed number.
// -1 = invalid, -2 = ignore.
// See the static constructor for initialization.
/// <summary>Map from ascii code to supposed number. See the static constructor for initialization.</summary>
/// <value><ul>
/// <li>-1: invalid</li>
/// <li>-2: ignore</li>
/// </ul></value>
private static readonly sbyte[] Digits;

private readonly ReadOnlySpan< byte > _macroString;
Expand All @@ -21,14 +24,16 @@ internal readonly ref struct MacroStringParser
/// <summary>Static constructor. Fills <see cref="Digits"/>, the lookup table indexed by ASCII code (0x00 to 0x7F).</summary>
static MacroStringParser()
{
Digits = new sbyte[0x80];
// Every entry starts out as -1 (invalid); only the characters assigned below get another value.
Digits.AsSpan().Fill(-1);
Digits['_'] = Digits['\''] = -2;
for (var i = '0'; i <= '9'; i++)
Digits[i] = (sbyte)(i - '0');
for (var i = 'A'; i <= 'F'; i++)
Digits[i] = (sbyte)(10 + (i - 'A'));
for (var i = 'a'; i <= 'f'; i++)
Digits[i] = (sbyte)(10 + (i - 'a'));
Digits.AsSpan().Fill( -1 );
// Programming languages such as C# will ignore underscores(_) between digits, to let users write 0x0123_4567_89AB_CDEF for ease of reading.
// C++ will use single quotes(') instead of underscores.
// Both separator characters are mapped to -2 (ignore).
Digits[ '_' ] = Digits[ '\'' ] = -2;
// '0'..'9' map to 0..9.
for( var i = '0'; i <= '9'; i++ )
Digits[ i ] = (sbyte) ( i - '0' );
// 'A'..'F' and 'a'..'f' both map to 10..15.
for( var i = 'A'; i <= 'F'; i++ )
Digits[ i ] = (sbyte) ( 10 + ( i - 'A' ) );
for( var i = 'a'; i <= 'f'; i++ )
Digits[ i ] = (sbyte) ( 10 + ( i - 'a' ) );
}

internal MacroStringParser( ReadOnlySpan< byte > macroString, SeStringBuilder builder, MacroStringParseOptions parseOptions )
Expand All @@ -39,25 +44,44 @@ internal MacroStringParser( ReadOnlySpan< byte > macroString, SeStringBuilder bu
}

/// <summary>Parses the macro string.</summary>
/// <returns>The builder.</returns>
/// <param name="offset">Offset in <see cref="_macroString"/> to parse from.</param>
/// <param name="stopOnCharRequiringEscape">Whether to stop parsing when an unescaped character that requires escaping is encountered, i.e. a character
/// that is used to form the string representation of SeString payloads, such as <c>(</c> or <c>,</c>. When <c>false</c>, such characters are appended
/// as-is, which allows them to be used freely when the string being parsed is at the topmost level (not a part of a string SeString expression).</param>
/// <param name="extraTerminators">If any of the bytes in this span is encountered while parsing, it is treated as the end of the string currently being
/// parsed. Used to terminate string SeString expressions: when parsing <c>&lt;string(asdf)&gt;</c>, parsing stops at <c>)</c> so that the expression is
/// <c>asdf</c>, instead of producing <c>asdf)</c> and then failing with invalid syntax.</param>
/// <returns>One past the final offset in <see cref="_macroString"/> that got parsed.</returns>
public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape, ReadOnlySpan< byte > extraTerminators )
{
var beginOffset = offset;
while( new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ).TryPeekNext( out var s, out _ ) )
while( new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ).TryPeekNext( out var c, out _ ) )
{
if( s.IsSeStringPayload )
Debug.Assert(
( _parseOptions.CharEnumerationFlags & UtfEnumeratorFlags.IgnoreErrors ) != 0 || c.ByteOffset == 0,
$"Offset of the first item retrieved UtfEnumerator should have been 0, unless {nameof( UtfEnumeratorFlags.IgnoreErrors )} is set." );

offset += c.ByteOffset;

if( c.IsSeStringPayload )
{
_builder.Append( new ReadOnlySeStringSpan( _macroString.Slice( offset + s.ByteOffset, s.ByteLength ) ) );
Debug.Assert( ( _parseOptions.CharEnumerationFlags & UtfEnumeratorFlags.Utf8SeString ) != 0,
$"SeString Payload should have not been yielded unless {nameof( UtfEnumeratorFlags.Utf8SeString )} is set." );

_builder.Append( new ReadOnlySeStringSpan( _macroString.Slice( offset, c.ByteLength ) ) );
offset += c.ByteLength;
continue;
}

switch( s.Value.UIntValue )
switch( c.Value.UIntValue )
{
case '\\':
// Backslashes will *always* produce the following character as-is.
// No special escape sequences such as \n and \t are defined for SeStrings.
offset += ParseMacroStringTextAndAppend( offset, extraTerminators );
break;

case <= byte.MaxValue when extraTerminators.Contains( (byte) s.Value.UIntValue ):
case <= byte.MaxValue when extraTerminators.Contains( (byte) c.Value.UIntValue ):
return offset - beginOffset;

case '<' when _parseOptions.ExceptionMode is MacroStringParseExceptionMode.Throw:
Expand All @@ -71,7 +95,7 @@ public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape
}
catch( MacroStringParseException e )
{
var byteLength = Math.Max( s.ByteLength, e.ByteOffset - offset );
var byteLength = Math.Max( c.ByteLength, e.ByteOffset - offset );
var sliceUntilError = _macroString.Slice( offset, byteLength );
_builder.Append( new UtfEnumerator( sliceUntilError, _parseOptions.CharEnumerationFlags ) );
if( _parseOptions.ExceptionMode == MacroStringParseExceptionMode.EmbedError )
Expand All @@ -81,13 +105,13 @@ public int ParseMacroStringAndAppend( int offset, bool stopOnCharRequiringEscape

break;

case <= byte.MaxValue when CharRequiresEscapeInSeString( s.Value.UIntValue ):
case <= byte.MaxValue when CharRequiresEscapeInSeString( c.Value.UIntValue ):
if( stopOnCharRequiringEscape )
return offset - beginOffset;

var v = unchecked( (byte) s.Value.UIntValue );
var v = unchecked( (byte) c.Value.UIntValue );
_builder.Append( MemoryMarshal.CreateReadOnlySpan( ref v, 1 ) );
offset += s.ByteLength;
offset += c.ByteLength;
break;

default:
Expand All @@ -104,6 +128,9 @@ private int ParseMacroStringTextAndAppend( int offset, ReadOnlySpan< byte > extr
var nextIsEscaped = false;
foreach( var c in new UtfEnumerator( _macroString[ offset.. ], _parseOptions.CharEnumerationFlags ) )
{
if( c.IsSeStringPayload )
return c.ByteOffset;

switch( c.Value.UIntValue )
{
case var _ when nextIsEscaped:
Expand Down Expand Up @@ -355,7 +382,12 @@ static bool TryParseInt( ReadOnlySpan< byte > data, out int result )
} while( !data.IsEmpty );

var maxPerDigit = 10u;
if( data.Length > 2 && data[ 0 ] == '0' )

// If the number string begins with 0 followed by non-decimal digits, try parsing it as non-decimal.
if( data.Length > 2
&& data[ 0 ] == '0'
&& data[ 1 ] is not ((byte) '_' or (byte) '\'')
&& data[ 1 ] is not (>= (byte) '0' and <= (byte) '9') )
{
maxPerDigit = (char) data[ 1 ] switch
{
Expand Down Expand Up @@ -460,7 +492,7 @@ private MacroCode ParseMacroCode( ref int offset )

macroCodeName = macroCodeName[ ..macroCodeNameLength ];

foreach( var n in MacroCodeExtensions.GetDefinedMacroCodes())
foreach( var n in MacroCodeExtensions.GetDefinedMacroCodes() )
{
if( macroCodeName.SequenceEqual( n.GetEncodeName() ) )
{
Expand Down
Loading