diff --git a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java index 5240c6d91d..4d68fa7326 100644 --- a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java +++ b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryInputCapsule.java @@ -37,11 +37,11 @@ import com.jme3.util.BufferUtils; import com.jme3.util.IntMap; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.nio.ShortBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; @@ -1013,129 +1013,17 @@ protected boolean[][] readBooleanArray2D(byte[] content) throws IOException { return value; } - /* - * UTF-8 crash course: - * - * UTF-8 codepoints map to UTF-16 codepoints and vv, which is what Java uses for its Strings. - * (so a UTF-8 codepoint can contain all possible values for a Java char) - * - * A UTF-8 codepoint can be 1, 2 or 3 bytes long. How long a codepint is can be told by reading the first byte: - * b < 0x80, 1 byte - * (b & 0xC0) == 0xC0, 2 bytes - * (b & 0xE0) == 0xE0, 3 bytes - * - * However there is an additional restriction to UTF-8, to enable you to find the start of a UTF-8 codepoint, - * if you start reading at a random point in a UTF-8 byte stream. That's why UTF-8 requires for the second and third byte of - * a multibyte codepoint: - * (b & 0x80) == 0x80 (in other words, first bit must be 1) - */ - private final static int UTF8_START = 0; // next byte should be the start of a new - private final static int UTF8_2BYTE = 2; // next byte should be the second byte of a 2 byte codepoint - private final static int UTF8_3BYTE_1 = 3; // next byte should be the second byte of a 3 byte codepoint - private final static int UTF8_3BYTE_2 = 4; // next byte should be the third byte of a 3 byte codepoint - private final static int UTF8_ILLEGAL = 10; // not an UTF8 string - - // String protected String readString(byte[] content) throws IOException { int length = readInt(content); if (length == BinaryOutputCapsule.NULL_OBJECT) return null; - /* - * @see ISSUE 276 - * - * We'll transfer the bytes into a separate byte array. - * While we do that we'll take the opportunity to check if the byte data is valid UTF-8. - * - * If it is not UTF-8 it is most likely saved with the BinaryOutputCapsule bug, that saves Strings using their native - * encoding. Unfortunatly there is no way to know what encoding was used, so we'll parse using the most common one in - * that case; latin-1 aka ISO8859_1 - * - * Encoding of "low" ASCII codepoint (in plain speak: when no special characters are used) will usually look the same - * for UTF-8 and the other 1 byte codepoint encodings (espc true for numbers and regular letters of the alphabet). So these - * are valid UTF-8 and will give the same result (at most a few charakters will appear different, such as the euro sign). - * - * However, when "high" codepoints are used (any codepoint that over 0x7F, in other words where the first bit is a 1) it's - * a different matter and UTF-8 and the 1 byte encoding greatly will differ, as well as most 1 byte encodings relative to each - * other. - * - * It is impossible to detect which one-byte encoding is used. Since UTF8 and practically all 1-byte encodings share the most - * used characters (the "none-high" ones) parsing them will give the same result. However, not all byte sequences are legal in - * UTF-8 (see explantion above). If not UTF-8 encoded content is detected we therefore fall back on latin1. We also log a warning. - * - * By this method we detect all use of 1 byte encoding if they: - * - use a "high" codepoint after a "low" codepoint or a sequence of codepoints that is valid as UTF-8 bytes, that starts with 1000 - * - use a "low" codepoint after a "high" codepoint - * - use a "low" codepoint after "high" codepoint, after a "high" codepoint that starts with 1110 - * - * In practise this means that unless 2 or 3 "high" codepoints are used after each other in proper order, we'll detect the string - * was not originally UTF-8 encoded. - * - */ byte[] bytes = new byte[length]; - int utf8State = UTF8_START; - int b; for (int x = 0; x < length; x++) { bytes[x] = content[index++]; - b = (int) bytes[x] & 0xFF; // unsign our byte - - switch (utf8State) { - case UTF8_START: - if (b < 0x80) { - // good - } - else if ((b & 0xC0) == 0xC0) { - utf8State = UTF8_2BYTE; - } - else if ((b & 0xE0) == 0xE0) { - utf8State = UTF8_3BYTE_1; - } - else { - utf8State = UTF8_ILLEGAL; - } - break; - case UTF8_3BYTE_1: - case UTF8_3BYTE_2: - case UTF8_2BYTE: - if ((b & 0x80) == 0x80) - utf8State = utf8State == UTF8_3BYTE_1 ? UTF8_3BYTE_2 : UTF8_START; - else - utf8State = UTF8_ILLEGAL; - break; - } } - try { - // even though so far the parsing might have been a legal UTF-8 sequence, only if a codepoint is fully given is it correct UTF-8 - if (utf8State == UTF8_START) { - // Java misspells UTF-8 as UTF8 for official use in java.lang - return new String(bytes, "UTF8"); - } - else { - logger.log( - Level.WARNING, - "Your export has been saved with an incorrect encoding for its String fields which means it might not load correctly " + - "due to encoding issues. You should probably re-export your work. See ISSUE 276 in the jME issue tracker." - ); - // We use ISO8859_1 to be consistent across platforms. We could default to native encoding, but this would lead to inconsistent - // behaviour across platforms! - // Developers that have previously saved their exports using the old exporter (which uses native encoding), can temporarly - // remove the ""ISO8859_1" parameter, and change the above if statement to "if (false)". - // They should then import and re-export their models using the same environment they were originally created in. - return new String(bytes, "ISO8859_1"); - } - } catch (UnsupportedEncodingException uee) { - // as a last resort fall back to platform native. - // JavaDoc is vague about what happens when a decoding a String that contains un undecodable sequence - // it also doesn't specify which encodings have to be supported (though UTF-8 and ISO8859 have been in the SUN JRE since at least 1.1) - logger.log( - Level.SEVERE, - "Your export has been saved with an incorrect encoding or your version of Java is unable to decode the stored string. " + - "While your export may load correctly by falling back, using it on different platforms or java versions might lead to "+ - "very strange inconsitenties. You should probably re-export your work. See ISSUE 276 in the jME issue tracker." - ); - return new String(bytes); - } + return new String(bytes, StandardCharsets.UTF_8); } protected String[] readStringArray(byte[] content) throws IOException { @@ -1418,4 +1306,4 @@ public > T readEnum(String name, Class enumType, T defVal) return null; } } -} \ No newline at end of file +} diff --git a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java index 7119bb78c2..bc8c7e825c 100644 --- a/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java +++ b/jme3-core/src/plugins/java/com/jme3/export/binary/BinaryOutputCapsule.java @@ -41,6 +41,7 @@ import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.nio.ShortBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; @@ -684,8 +685,7 @@ protected void write(String value) throws IOException { write(NULL_OBJECT); return; } - // write our output as UTF-8. Java misspells UTF-8 as UTF8 for official use in java.lang - byte[] bytes = value.getBytes("UTF8"); + byte[] bytes = value.getBytes(StandardCharsets.UTF_8); write(bytes.length); baos.write(bytes); }