Skip to content

Commit

Permalink
Check for String Encoding Errors: Improvements for original games
Browse files Browse the repository at this point in the history
- Greatly improved detection of UTF-8 character codes in ANSI game strings
- Fixed repair option for ANSI game strings with UTF-8 character codes
  • Loading branch information
Argent77 committed Nov 10, 2024
1 parent b7fd7b7 commit 0c96cd6
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 37 deletions.
187 changes: 151 additions & 36 deletions src/org/infinity/check/StringValidationChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderMalfunctionError;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
Expand Down Expand Up @@ -357,38 +358,89 @@ private void validateInputAnsi(CharsetDecoder decoder, ByteBuffer inBuf, CharBuf
outBuf.flip();
final String textAnsi = outBuf.toString();

// performing detection of utf-8 characters in ansi byte code
inBuf.flip();
outBuf.limit(outBuf.capacity());
outBuf.position(0);
final CharsetDecoder decoderUtf8 = StandardCharsets.UTF_8.newDecoder();
final CoderResult cr = decoderUtf8.decode(inBuf, outBuf, true);
if (!cr.isError()) {
outBuf.flip();
final String textUtf8 = outBuf.toString();

boolean isError = false;
for (int ofs = 0, len1 = textAnsi.length(), len2 = textUtf8.length(); ofs < len1 && ofs < len2
&& !isError; ofs++) {
final char ch1 = textAnsi.charAt(ofs);
final char ch2 = textUtf8.charAt(ofs);
if (ch1 != ch2) {
synchronized(this) {
final byte[] buffer = Arrays.copyOf(inBuf.array(), inBuf.limit());
table.addTableItem(new StringErrorTableItem(entry, strref, textAnsi,
new StringError(isFemale, strref, buffer, ofs, 1, "encoding error")));
isError = true;
}
}
final byte[] buffer = Arrays.copyOf(inBuf.array(), inBuf.limit());
int offset = 0;
while (offset < buffer.length) {
final StringError se = validateUtf8Input(buffer, offset, buffer.length - offset, isFemale, strref);
if (se == null) {
break;
}
table.addTableItem(new StringErrorTableItem(entry, strref, textAnsi, se));
offset = se.getOffset() + se.getLength();
}
}

if (!isError && textAnsi.length() > textUtf8.length()) {
synchronized (this) {
final byte[] buffer = Arrays.copyOf(inBuf.array(), inBuf.limit());
table.addTableItem(new StringErrorTableItem(entry, strref, textAnsi,
new StringError(isFemale, strref, buffer, textUtf8.length(), 1, "encoding error")));
/**
 * Scans the specified buffer region for the first multi-byte UTF-8 character and reports it as a
 * {@link StringError}.
 * <p>
 * Only well-formed UTF-8 sequences are reported. Bytes that cannot start a legal sequence, overlong encodings,
 * encoded UTF-16 surrogates and code points beyond U+10FFFF are assumed to be regular ANSI data and are skipped.
 * Sequences longer than the limit returned by {@code CharsetDetector.getMaxAnsiUtf8Length()} for the charset of
 * the string table are skipped as well.
 * </p>
 *
 * @param buffer   Byte buffer with raw text data.
 * @param offset   Start offset for the analysis.
 * @param len      Max. number of bytes to analyze.
 * @param isFemale Passed through unchanged to the created {@link StringError}.
 * @param strref   Passed through unchanged to the created {@link StringError}.
 * @return An initialized {@link StringError} that covers the first multi-byte UTF-8 character that was found.
 *         Returns {@code null} if no such character exists or if the arguments do not describe a valid region of
 *         {@code buffer}.
 */
private StringError validateUtf8Input(byte[] buffer, int offset, int len, boolean isFemale, int strref) {
  if (buffer == null || offset < 0 || len <= 0 || offset + len > buffer.length) {
    return null;
  }

  // Limiting max. number of bytes per utf-8 code to the specified ANSI code page
  // to reduce detection of false positives.
  final int maxBytes = CharsetDetector.getMaxAnsiUtf8Length(StringTable.getCharset());

  final int end = offset + len;
  for (int i = offset; i < end; i++) {
    final int seqLength = getUtf8SequenceLength(buffer, i, end, maxBytes);
    if (seqLength > 1) {
      return new StringError(isFemale, strref, buffer, i, seqLength, "encoding error");
    }
    // else assume legal ANSI character
  }

  return null;
}

/**
 * Determines whether a well-formed multi-byte UTF-8 sequence starts at the specified buffer position.
 *
 * @param buffer   Byte buffer with raw text data.
 * @param pos      Position of the potential lead byte. Must be a valid index of {@code buffer}.
 * @param end      Exclusive end of the region that may be inspected.
 * @param maxBytes Max. sequence length to consider. Longer sequences are never reported.
 * @return Length of the sequence in bytes (2 to 4), or 0 if no well-formed multi-byte sequence of at most
 *         {@code maxBytes} bytes starts at {@code pos}.
 */
private int getUtf8SequenceLength(byte[] buffer, int pos, int end, int maxBytes) {
  final int b1 = buffer[pos] & 0xff;
  if (b1 < 0xc2 || b1 > 0xf4) {
    // ASCII, continuation byte (0x80..0xbf) or a byte that is never a legal UTF-8 lead (0xc0, 0xc1, 0xf5..0xff)
    return 0;
  }

  if (b1 < 0xe0) {
    // two bytes: lead bytes 0xc2..0xdf can only produce U+0080..U+07FF, no range check needed
    if (maxBytes > 1 && pos + 1 < end && (buffer[pos + 1] & 0xc0) == 0x80) {
      return 2;
    }
    return 0;
  }

  if (b1 < 0xf0) {
    // three bytes
    if (maxBytes > 2 && pos + 2 < end) {
      final int b2 = buffer[pos + 1] & 0xff;
      final int b3 = buffer[pos + 2] & 0xff;
      if ((b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80) {
        final int codePoint = ((b1 & 0x0f) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f);
        // Values below U+0800 are overlong encodings and U+D800..U+DFFF are UTF-16 surrogates:
        // neither is legal UTF-8, so the bytes are most likely genuine ANSI/multi-byte data.
        final boolean overlong = codePoint < 0x800;
        final boolean surrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        if (!overlong && !surrogate) {
          return 3;
        }
      }
    }
    return 0;
  }

  // four bytes
  if (maxBytes > 3 && pos + 3 < end) {
    final int b2 = buffer[pos + 1] & 0xff;
    final int b3 = buffer[pos + 2] & 0xff;
    final int b4 = buffer[pos + 3] & 0xff;
    if ((b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80 && (b4 & 0xc0) == 0x80) {
      final int codePoint = ((b1 & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3f) << 6) | (b4 & 0x3f);
      // Values below U+10000 are overlong encodings, values above U+10FFFF are outside of the Unicode range
      if (codePoint >= 0x10000 && codePoint < 0x110000) {
        return 4;
      }
    }
  }

  return 0;
}

/** Executes the repair operation as a background task with UI feedback. */
Expand Down Expand Up @@ -474,7 +526,7 @@ private Couple<Integer, Integer> repairEntries(SwingWorker<?,?> worker) {
StringTable.write(null);
}

retVal = new Couple<>(numRemoved, numRemoved);
retVal = new Couple<>(numReplaced, numRemoved);
}

return retVal;
Expand Down Expand Up @@ -552,7 +604,10 @@ public void actionPerformed(ActionEvent e) {

// perform repair operation
final int count = table.getModel().getRowCount();
final String msg = (count > 1) ? "Repair all " + count + " entries?" : "Repair " + count + " entry?";
String msg = (count > 1) ? "Repair all " + count + " issues?" : "Repair " + count + " issue?";
if (!Profile.isEnhancedEdition()) {
msg += "\nCaution: Operation may be inaccurate for some languages.";
}
final int result = JOptionPane.showConfirmDialog(getResultFrame(), msg, "Question", JOptionPane.YES_NO_OPTION,
JOptionPane.QUESTION_MESSAGE);
if (result == JOptionPane.YES_OPTION) {
Expand Down Expand Up @@ -733,14 +788,74 @@ public String getRepaired() {
}
} else {
// Fixing ANSI/multi-byte charset
final CharsetDecoder csd = StandardCharsets.UTF_8.newDecoder();
csd.onMalformedInput(CodingErrorAction.REPORT);
csd.onUnmappableCharacter(CodingErrorAction.REPORT);
final int maxCharLength = (int) Math.ceil(csd.maxCharsPerByte() * data.length);
final CharBuffer outBuf = CharBuffer.allocate(maxCharLength);
final CoderResult cr = csd.decode(ByteBuffer.wrap(data), outBuf, true);
if (!cr.isError() && !cr.isUnderflow()) {
retVal = outBuf.flip().toString();
if (data.length > 0) {
// attempting to find a replacement string
int ofs = 0;
while (ofs < data.length) {
int codePoint = 0;
final int b1 = data[ofs] & 0xff;
if ((b1 & 0xe0) == 0xc0 && ofs + 1 < data.length) {
// two bytes
final int b2 = data[ofs + 1] & 0xff;
if ((b2 & 0xc0) == 0x80) {
codePoint = ((b1 & 0x1f) << 6) | (b2 & 0x3f);
}
ofs += 2;
} else if ((b1 & 0xf0) == 0xe0 && ofs + 2 < data.length) {
// three bytes
final int b2 = data[ofs + 1] & 0xff;
final int b3 = data[ofs + 2] & 0xff;
if ((b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80) {
codePoint = ((b1 & 0x0f) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f);
}
ofs += 3;
} else if ((b1 & 0xf8) == 0xf0 && ofs + 3 < data.length) {
// four bytes
final int b2 = data[ofs + 1] & 0xff;
final int b3 = data[ofs + 2] & 0xff;
final int b4 = data[ofs + 3] & 0xff;
if ((b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80 && (b4 & 0xc0) == 0x80) {
codePoint = ((b1 & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3f) << 6) | (b4 & 0x3f);
}
ofs += 4;
}

if (codePoint > 0) {
// Performing thorough analysis on byte data
boolean isUtf = true;
try {
final char[] chars = Character.toChars(codePoint);
final CharsetEncoder cse = StringTable.getCharset().newEncoder();
cse.onMalformedInput(CodingErrorAction.REPORT);
cse.onUnmappableCharacter(CodingErrorAction.REPORT);
final CoderResult ecr = cse.encode(CharBuffer.wrap(chars), ByteBuffer.allocate(chars.length * 4), true);
if (!ecr.isError()) {
// Add only if utf-8 code point defines a valid character in the local charset of the string table
retVal += new String(chars);
} else {
isUtf = false;
}
} catch (IllegalArgumentException e) {
// not a valid Unicode code point
isUtf = false;
}

if (!isUtf) {
// Test if raw bytes already defined valid characters in the local charset of the string table
final CharsetDecoder csd = StringTable.getCharset().newDecoder();
csd.onMalformedInput(CodingErrorAction.REPORT);
csd.onUnmappableCharacter(CodingErrorAction.REPORT);
final ByteBuffer bb = ByteBuffer.wrap(data);
final CharBuffer cb = CharBuffer.allocate(data.length * 2);
final CoderResult dcr = csd.decode(bb, cb, true);
if (!dcr.isError()) {
// Restoring original content
cb.flip();
retVal += cb.toString();
}
}
}
}
}
}

Expand Down
21 changes: 20 additions & 1 deletion src/org/infinity/gui/StringEditor.java
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,15 @@ public StringEditor() {
}

/**
* Saves any changes that were made to the string table to disk.
* Performs the chosen operation on the string table.
* <p>
* In non-interactive mode it will execute the code specified by the {@code saveOperation} parameter.
* </p>
* <p>
* In interactive mode it depends on the user choice: {@code YES} executes the code specified by the
* {@code saveOperation} parameter. {@code NO} resets the modified content of the string table. {@code CANCEL} skips
* all operations.
* </p>
*
* @param saveOperation {@code Operation} that performs the actual string table save operation. If {@code null} is
* specified then a simple {@link StringTable#write(ProgressCallback)} is performed.
Expand All @@ -137,6 +145,7 @@ public static boolean saveModified(Operation saveOperation, boolean interactive,
boolean retVal = true;
if (StringTable.isModified()) {
boolean shouldSave = true;
boolean shouldClear = false;

if (interactive) {
int optionType = forced ? JOptionPane.YES_NO_OPTION : JOptionPane.YES_NO_CANCEL_OPTION;
Expand All @@ -146,10 +155,12 @@ public static boolean saveModified(Operation saveOperation, boolean interactive,
int result = JOptionPane.showConfirmDialog(parent, "String table has been modified. Save changes to disk?",
"Save changes", optionType, JOptionPane.QUESTION_MESSAGE);
shouldSave = (result == JOptionPane.YES_OPTION);
shouldClear = (result == JOptionPane.NO_OPTION);
retVal = (result != JOptionPane.CANCEL_OPTION);
}

if (shouldSave) {
// performing specified save operation
final Operation op = (saveOperation != null) ? saveOperation : () -> StringTable.write(null);
final RootPaneContainer pane =
(parent instanceof RootPaneContainer) ? (RootPaneContainer)parent : NearInfinity.getInstance();
Expand All @@ -174,6 +185,14 @@ protected Void doInBackground() throws Exception {
Logger.error(e);
}
}

if (shouldClear) {
// removing "modified" flag from string table
StringTable.resetModified(StringTable.Type.MALE);
if (StringTable.hasFemaleTable()) {
StringTable.resetModified(StringTable.Type.FEMALE);
}
}
}

return retVal;
Expand Down
50 changes: 50 additions & 0 deletions src/org/infinity/util/CharsetDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,56 @@ public static String getDefaultCharset(String langCode) {
return retVal;
}

/**
 * Returns the upper limit of bytes per UTF-8 sequence that is assumed for characters of the specified ANSI
 * charset.
 * <p>
 * The charset is matched by name or alias: {@code us-ascii} yields 1, {@code windows-1250}, {@code windows-1251},
 * {@code windows-1252} and {@code windows-1254} yield 2, and {@code windows-31j}, {@code ibm-949} and {@code gbk}
 * yield 3. Any other charset, as well as {@code null}, yields 4.
 * </p>
 *
 * @param charset ANSI codepage as {@code Charset} object.
 * @return Number of bytes in the range from 1 to 4.
 */
public static int getMaxAnsiUtf8Length(Charset charset) {
  if (charset == null) {
    return 4;
  }

  if (charsetIs(charset, "us-ascii")) {
    return 1;
  }

  // NOTE(review): some of these code pages also define characters that need three bytes in UTF-8
  // (e.g. the euro sign in windows-1252); the limit of 2 looks like a deliberate heuristic - confirm.
  final boolean isSingleByteCodepage = charsetIs(charset, "windows-1252")
      || charsetIs(charset, "windows-1250")
      || charsetIs(charset, "windows-1251")
      || charsetIs(charset, "windows-1254");
  if (isSingleByteCodepage) {
    return 2;
  }

  final boolean isDoubleByteCodepage = charsetIs(charset, "windows-31j")
      || charsetIs(charset, "ibm-949")
      || charsetIs(charset, "gbk");
  if (isDoubleByteCodepage) {
    return 3;
  }

  // unknown charset: do not restrict the sequence length
  return 4;
}

/**
 * Checks whether the specified charset is known under the given name.
 * <p>
 * The comparison ignores case and considers the canonical charset name as well as all of its aliases.
 * </p>
 *
 * @param charset {@link Charset} to check.
 * @param name    Charset name.
 * @return {@code true} if {@code name} matches the name or any of the name aliases of the specified
 *         {@code charset}. Returns {@code false} otherwise, which includes the case that either argument is
 *         {@code null}.
 */
private static boolean charsetIs(Charset charset, String name) {
  if (charset == null || name == null) {
    return false;
  }

  // canonical name first
  if (name.equalsIgnoreCase(charset.name())) {
    return true;
  }

  // fall back to the alias list
  for (final String alias : charset.aliases()) {
    if (name.equalsIgnoreCase(alias)) {
      return true;
    }
  }

  return false;
}

//-------------------------- INNER CLASSES --------------------------

// Handles character decoding and encoding
Expand Down

0 comments on commit 0c96cd6

Please sign in to comment.