Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@
import com.jme3.util.BufferUtils;
import com.jme3.util.IntMap;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.ShortBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
Expand Down Expand Up @@ -1013,129 +1013,17 @@ protected boolean[][] readBooleanArray2D(byte[] content) throws IOException {
return value;
}

/*
* UTF-8 crash course:
*
* UTF-8 codepoints map to UTF-16 codepoints and vv, which is what Java uses for its Strings.
* (so a UTF-8 codepoint can contain all possible values for a Java char)
*
* A UTF-8 codepoint can be 1, 2 or 3 bytes long. How long a codepint is can be told by reading the first byte:
* b < 0x80, 1 byte
* (b & 0xC0) == 0xC0, 2 bytes
* (b & 0xE0) == 0xE0, 3 bytes
*
* However there is an additional restriction to UTF-8, to enable you to find the start of a UTF-8 codepoint,
* if you start reading at a random point in a UTF-8 byte stream. That's why UTF-8 requires for the second and third byte of
* a multibyte codepoint:
* (b & 0x80) == 0x80 (in other words, first bit must be 1)
*/
private final static int UTF8_START = 0; // next byte should be the start of a new
private final static int UTF8_2BYTE = 2; // next byte should be the second byte of a 2 byte codepoint
private final static int UTF8_3BYTE_1 = 3; // next byte should be the second byte of a 3 byte codepoint
private final static int UTF8_3BYTE_2 = 4; // next byte should be the third byte of a 3 byte codepoint
private final static int UTF8_ILLEGAL = 10; // not an UTF8 string

// String
protected String readString(byte[] content) throws IOException {
int length = readInt(content);
if (length == BinaryOutputCapsule.NULL_OBJECT)
return null;

/*
* @see ISSUE 276
*
* We'll transfer the bytes into a separate byte array.
* While we do that we'll take the opportunity to check if the byte data is valid UTF-8.
*
* If it is not UTF-8 it is most likely saved with the BinaryOutputCapsule bug, that saves Strings using their native
* encoding. Unfortunatly there is no way to know what encoding was used, so we'll parse using the most common one in
* that case; latin-1 aka ISO8859_1
*
* Encoding of "low" ASCII codepoint (in plain speak: when no special characters are used) will usually look the same
* for UTF-8 and the other 1 byte codepoint encodings (espc true for numbers and regular letters of the alphabet). So these
* are valid UTF-8 and will give the same result (at most a few charakters will appear different, such as the euro sign).
*
* However, when "high" codepoints are used (any codepoint that over 0x7F, in other words where the first bit is a 1) it's
* a different matter and UTF-8 and the 1 byte encoding greatly will differ, as well as most 1 byte encodings relative to each
* other.
*
* It is impossible to detect which one-byte encoding is used. Since UTF8 and practically all 1-byte encodings share the most
* used characters (the "none-high" ones) parsing them will give the same result. However, not all byte sequences are legal in
* UTF-8 (see explantion above). If not UTF-8 encoded content is detected we therefore fall back on latin1. We also log a warning.
*
* By this method we detect all use of 1 byte encoding if they:
* - use a "high" codepoint after a "low" codepoint or a sequence of codepoints that is valid as UTF-8 bytes, that starts with 1000
* - use a "low" codepoint after a "high" codepoint
* - use a "low" codepoint after "high" codepoint, after a "high" codepoint that starts with 1110
*
* In practise this means that unless 2 or 3 "high" codepoints are used after each other in proper order, we'll detect the string
* was not originally UTF-8 encoded.
*
*/
byte[] bytes = new byte[length];
int utf8State = UTF8_START;
int b;
for (int x = 0; x < length; x++) {
bytes[x] = content[index++];
b = (int) bytes[x] & 0xFF; // unsign our byte

switch (utf8State) {
case UTF8_START:
if (b < 0x80) {
// good
}
else if ((b & 0xC0) == 0xC0) {
utf8State = UTF8_2BYTE;
}
else if ((b & 0xE0) == 0xE0) {
utf8State = UTF8_3BYTE_1;
}
else {
utf8State = UTF8_ILLEGAL;
}
break;
case UTF8_3BYTE_1:
case UTF8_3BYTE_2:
case UTF8_2BYTE:
if ((b & 0x80) == 0x80)
utf8State = utf8State == UTF8_3BYTE_1 ? UTF8_3BYTE_2 : UTF8_START;
else
utf8State = UTF8_ILLEGAL;
break;
}
}

try {
// even though so far the parsing might have been a legal UTF-8 sequence, only if a codepoint is fully given is it correct UTF-8
if (utf8State == UTF8_START) {
// Java misspells UTF-8 as UTF8 for official use in java.lang
return new String(bytes, "UTF8");
}
else {
logger.log(
Level.WARNING,
"Your export has been saved with an incorrect encoding for its String fields which means it might not load correctly " +
"due to encoding issues. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
);
// We use ISO8859_1 to be consistent across platforms. We could default to native encoding, but this would lead to inconsistent
// behaviour across platforms!
// Developers that have previously saved their exports using the old exporter (which uses native encoding), can temporarly
// remove the ""ISO8859_1" parameter, and change the above if statement to "if (false)".
// They should then import and re-export their models using the same environment they were originally created in.
return new String(bytes, "ISO8859_1");
}
} catch (UnsupportedEncodingException uee) {
// as a last resort fall back to platform native.
// JavaDoc is vague about what happens when a decoding a String that contains un undecodable sequence
// it also doesn't specify which encodings have to be supported (though UTF-8 and ISO8859 have been in the SUN JRE since at least 1.1)
logger.log(
Level.SEVERE,
"Your export has been saved with an incorrect encoding or your version of Java is unable to decode the stored string. " +
"While your export may load correctly by falling back, using it on different platforms or java versions might lead to "+
"very strange inconsitenties. You should probably re-export your work. See ISSUE 276 in the jME issue tracker."
);
return new String(bytes);
}
return new String(bytes, StandardCharsets.UTF_8);
}

protected String[] readStringArray(byte[] content) throws IOException {
Expand Down Expand Up @@ -1418,4 +1306,4 @@ public <T extends Enum<T>> T readEnum(String name, Class<T> enumType, T defVal)
return null;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.ShortBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
Expand Down Expand Up @@ -684,8 +685,7 @@ protected void write(String value) throws IOException {
write(NULL_OBJECT);
return;
}
// write our output as UTF-8. Java misspells UTF-8 as UTF8 for official use in java.lang
byte[] bytes = value.getBytes("UTF8");
byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
write(bytes.length);
baos.write(bytes);
}
Expand Down