diff --git a/eslint.config.mjs b/eslint.config.mjs index fa52f28f45ce5..9cdd1157b2c66 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -27,6 +27,7 @@ export default [{ '**/test/', 'src/polyfill/', 'src/lib/', + 'src/binaryDecode.js', 'src/minimum_runtime_check.js', 'src/runtime_*.js', 'src/shell*.js', diff --git a/site/source/docs/tools_reference/settings_reference.rst b/site/source/docs/tools_reference/settings_reference.rst index 85f053734d054..c55d47022860a 100644 --- a/site/source/docs/tools_reference/settings_reference.rst +++ b/site/source/docs/tools_reference/settings_reference.rst @@ -2823,6 +2823,20 @@ then you can safely ignore this warning. Default value: false +.. _single_file_binary_encode: + +SINGLE_FILE_BINARY_ENCODE +========================= + +If true, binary Wasm content is encoded using a custom UTF-8 embedding +instead of base64. This generates a smaller binary that compresses well. +Set this to false to revert back to earlier base64 encoding if you run into +issues with the binary encoding. (and please let us know of any such issues) +If no issues arise, this option will permanently become the default in the +future. + +Default value: true + .. _auto_js_libraries: AUTO_JS_LIBRARIES diff --git a/src/binaryDecode.js b/src/binaryDecode.js new file mode 100644 index 0000000000000..5706354a5a85b --- /dev/null +++ b/src/binaryDecode.js @@ -0,0 +1,10 @@ +// Prevent Closure from inlining the binaryDecode() function; otherwise +// Closure may analyze through the WASM_BINARY_DATA placeholder string into this +// function, leading to incorrect results. 
+/** @noinline */ +function binaryDecode(bin) { + for (var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) { + o[i] = bin.charCodeAt(i) - 1; + } + return o; +} diff --git a/src/postamble_minimal.js b/src/postamble_minimal.js index 8a76698156e0c..9472ec752e9f8 100644 --- a/src/postamble_minimal.js +++ b/src/postamble_minimal.js @@ -149,7 +149,9 @@ function initRuntime(wasmExports) { // Initialize wasm (asynchronous) -#if SINGLE_FILE && WASM == 1 && !WASM2JS +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS +Module['wasm'] = binaryDecode("<<< WASM_BINARY_DATA >>>"); +#elif SINGLE_FILE && WASM == 1 && !WASM2JS Module['wasm'] = base64Decode('<<< WASM_BINARY_DATA >>>'); #endif diff --git a/src/preamble.js b/src/preamble.js index bd6aec1cf9924..c9002009f05d2 100644 --- a/src/preamble.js +++ b/src/preamble.js @@ -423,27 +423,34 @@ function getWasmBinary(file) {} #else function findWasmBinary() { -#if SINGLE_FILE +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS + return binaryDecode("<<< WASM_BINARY_DATA >>>"); +#elif SINGLE_FILE return base64Decode('<<< WASM_BINARY_DATA >>>'); +#elif AUDIO_WORKLET || !EXPORT_ES6 // For an Audio Worklet, we cannot use `new URL()`. + return locateFile('{{{ WASM_BINARY_FILE }}}'); #else -#if EXPORT_ES6 && !AUDIO_WORKLET - if (Module['locateFile']) { -#endif - return locateFile('{{{ WASM_BINARY_FILE }}}'); -#if EXPORT_ES6 && !AUDIO_WORKLET // For an Audio Worklet, we cannot use `new URL()`. - } + #if ENVIRONMENT_MAY_BE_SHELL if (ENVIRONMENT_IS_SHELL) { return '{{{ WASM_BINARY_FILE }}}'; } #endif + + if (Module['locateFile']) { + return locateFile('{{{ WASM_BINARY_FILE }}}'); + } + // Use bundler-friendly `new URL(..., import.meta.url)` pattern; works in browsers too. 
return new URL('{{{ WASM_BINARY_FILE }}}', import.meta.url).href; -#endif + #endif } function getBinarySync(file) { +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE + return file; +#else #if SINGLE_FILE if (ArrayBuffer.isView(file)) { return file; @@ -464,6 +471,7 @@ function getBinarySync(file) { #else throw 'sync fetching of the wasm failed: you can preload it to Module["wasmBinary"] manually, or emcc.py will do that for you when generating HTML (but not JS)'; #endif +#endif } async function getWasmBinary(binaryFile) { diff --git a/src/proxyClient.js b/src/proxyClient.js index 0a3049e2c4f7e..3d14a4ba06960 100644 --- a/src/proxyClient.js +++ b/src/proxyClient.js @@ -132,7 +132,18 @@ var frameId = 0; // Worker -var filename = '<<< filename >>>'; +var filename = "<<< filename >>>"; + +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE +#include "binaryDecode.js" + +#if ENVIRONMENT_MAY_BE_NODE +if (ENVIRONMENT_IS_NODE) filename = "data:text/javascript;base64," + Buffer.from(binaryDecode(filename)).toString('base64'); +else +#endif + filename = URL.createObjectURL(new Blob([binaryDecode(filename)], {type: 'application/javascript'})); + +#endif var worker = new Worker(filename); diff --git a/src/runtime_common.js b/src/runtime_common.js index 6110d611ab6e8..93523d4e12ecd 100644 --- a/src/runtime_common.js +++ b/src/runtime_common.js @@ -27,6 +27,10 @@ function growMemViews() { #include "runtime_asan.js" #endif +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS +#include "binaryDecode.js" +#endif + #if MODULARIZE var readyPromiseResolve, readyPromiseReject; #endif diff --git a/src/settings.js b/src/settings.js index 7624d4a25fdc1..a6019134fe63d 100644 --- a/src/settings.js +++ b/src/settings.js @@ -1853,6 +1853,15 @@ var WASMFS = false; // [link] var SINGLE_FILE = false; +// If true, binary Wasm content is encoded using a custom UTF-8 embedding +// instead of base64. This generates a smaller binary that compresses well. 
+// Set this to false to revert back to earlier base64 encoding if you run into +// issues with the binary encoding. (and please let us know of any such issues) +// If no issues arise, this option will permanently become the default in the +// future. +// [link] +var SINGLE_FILE_BINARY_ENCODE = true; + // If set to 1, all JS libraries will be automatically available at link time. // This gets set to 0 in STRICT mode (or with MINIMAL_RUNTIME) which mean you // need to explicitly specify -lfoo.js in at link time in order to access diff --git a/test/codesize/test_codesize_hello_single_file.json b/test/codesize/test_codesize_hello_single_file.json index b216a17b1a986..8ff0ea14ecf13 100644 --- a/test/codesize/test_codesize_hello_single_file.json +++ b/test/codesize/test_codesize_hello_single_file.json @@ -1,6 +1,6 @@ { - "a.out.js": 6497, - "a.out.js.gz": 3579, + "a.out.js": 5394, + "a.out.js.gz": 2992, "sent": [ "a (fd_write)" ] diff --git a/test/codesize/test_codesize_minimal_O0.expected.js b/test/codesize/test_codesize_minimal_O0.expected.js index 878c3f3358d36..16279522e87cb 100644 --- a/test/codesize/test_codesize_minimal_O0.expected.js +++ b/test/codesize/test_codesize_minimal_O0.expected.js @@ -585,7 +585,7 @@ function createExportWrapper(name, nargs) { var wasmBinaryFile; function findWasmBinary() { - return locateFile('a.out.wasm'); + return locateFile('a.out.wasm'); } function getBinarySync(file) { diff --git a/test/codesize/test_minimal_runtime_code_size_hello_webgl2_wasm_singlefile.json b/test/codesize/test_minimal_runtime_code_size_hello_webgl2_wasm_singlefile.json new file mode 100644 index 0000000000000..cc29e72f2cb69 --- /dev/null +++ b/test/codesize/test_minimal_runtime_code_size_hello_webgl2_wasm_singlefile.json @@ -0,0 +1,4 @@ +{ + "a.html": 15176, + "a.html.gz": 9126 +} diff --git a/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json b/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json index 
3576098483a87..776ec42e8b6f9 100644 --- a/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json +++ b/test/codesize/test_minimal_runtime_code_size_random_printf_wasm.json @@ -1,4 +1,4 @@ { - "a.html": 12507, - "a.html.gz": 6823 + "a.html": 10998, + "a.html.gz": 5758 } diff --git a/test/codesize/test_unoptimized_code_size.json b/test/codesize/test_unoptimized_code_size.json index 7fce8dfb0d095..64aaf7fad2324 100644 --- a/test/codesize/test_unoptimized_code_size.json +++ b/test/codesize/test_unoptimized_code_size.json @@ -1,16 +1,16 @@ { - "hello_world.js": 56856, - "hello_world.js.gz": 17627, + "hello_world.js": 56854, + "hello_world.js.gz": 17626, "hello_world.wasm": 15119, "hello_world.wasm.gz": 7444, - "no_asserts.js": 26634, - "no_asserts.js.gz": 8883, + "no_asserts.js": 26632, + "no_asserts.js.gz": 8884, "no_asserts.wasm": 12219, "no_asserts.wasm.gz": 6005, - "strict.js": 54871, - "strict.js.gz": 16967, + "strict.js": 54869, + "strict.js.gz": 16968, "strict.wasm": 15119, "strict.wasm.gz": 7442, - "total": 180818, - "total_gz": 64368 + "total": 180812, + "total_gz": 64369 } diff --git a/test/test_codesize.py b/test/test_codesize.py index 7ca92e364236d..695c1d869f300 100644 --- a/test/test_codesize.py +++ b/test/test_codesize.py @@ -43,6 +43,7 @@ class codesize(RunnerCore): 'random_printf_wasm2js': ('random_printf', True), 'hello_webgl_wasm': ('hello_webgl', False), 'hello_webgl_wasm2js': ('hello_webgl', True), + 'hello_webgl2_wasm_singlefile': ('hello_webgl2_wasm_singlefile', False), 'hello_webgl2_wasm': ('hello_webgl2', False), 'hello_webgl2_wasm2js': ('hello_webgl2', True), 'math': ('math', False), @@ -87,6 +88,7 @@ def test_minimal_runtime_code_size(self, test_name, wasm2js, compare_js_output=F '-lGL', '-sMODULARIZE'] hello_webgl2_sources = hello_webgl_sources + ['-sMAX_WEBGL_VERSION=2'] + hello_webgl2_wasm_singlefile_sources = hello_webgl2_sources + ['-sSINGLE_FILE'] hello_wasm_worker_sources = 
[test_file('wasm_worker/wasm_worker_code_size.c'), '-sWASM_WORKERS', '-sENVIRONMENT=web'] audio_worklet_sources = [test_file('webaudio/audioworklet.c'), '-sWASM_WORKERS', '-sAUDIO_WORKLET', '-sENVIRONMENT=web', '-sTEXTDECODER=1'] embind_hello_sources = [test_file('codesize/embind_hello_world.cpp'), '-lembind'] @@ -98,6 +100,7 @@ def test_minimal_runtime_code_size(self, test_name, wasm2js, compare_js_output=F 'hello_webgl': hello_webgl_sources, 'math': math_sources, 'hello_webgl2': hello_webgl2_sources, + 'hello_webgl2_wasm_singlefile': hello_webgl2_wasm_singlefile_sources, 'hello_wasm_worker': hello_wasm_worker_sources, 'audio_worklet': audio_worklet_sources, 'embind_val': embind_val_sources, @@ -411,3 +414,13 @@ def test_codesize_file_preload(self): def test_small_js_flags(self): self.emcc('browser_test_hello_world.c', ['-O3', '--closure=1', '-sINCOMING_MODULE_JS_API=[]', '-sENVIRONMENT=web', '--output-eol=linux']) self.check_output_sizes('a.out.js') + + # This test verifies that a gzipped, binary-encoded SINGLE_FILE build results in a smaller size + # than the gzipped, base64-encoded version. 
+ def test_binary_encode_is_smaller_than_base64_encode(self): + self.emcc('hello_world.c', ['-O2', '-sSINGLE_FILE', '-sSINGLE_FILE_BINARY_ENCODE']) + size_binary_encode = len(gzip.compress(read_binary('a.out.js'))) + self.emcc('hello_world.c', ['-O2', '-sSINGLE_FILE', '-sSINGLE_FILE_BINARY_ENCODE=0']) + size_base64 = len(gzip.compress(read_binary('a.out.js'))) + print(f'Binary encoded file size: {size_binary_encode}, base64 encoded file size: {size_base64}') + self.assertLess(size_binary_encode, size_base64) diff --git a/test/test_other.py b/test/test_other.py index 5d1ea786469ff..52940388f0c1d 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9470,16 +9470,18 @@ def test_standalone_system_headers(self, prefix): @is_slow_test @also_with_wasm2js + @also_with_minimal_runtime @parameterized({ '': (False, False), 'debug': (True, False), 'closure': (False, True), }) @parameterized({ - '': (True,), - 'disabled': (False,), + '': (True,False), + 'disabled': (False,False), + 'binary_encode': (True,True), }) - def test_single_file(self, debug_enabled, closure_enabled, single_file_enabled): + def test_single_file(self, debug_enabled, closure_enabled, single_file_enabled, single_file_binary_encoded): cmd = [EMCC, test_file('hello_world.c')] + self.get_cflags() if single_file_enabled: @@ -9488,6 +9490,8 @@ def test_single_file(self, debug_enabled, closure_enabled, single_file_enabled): else: expect_wasm = self.is_wasm() + cmd += [f'-sSINGLE_FILE_BINARY_ENCODE={int(single_file_binary_encoded)}'] + if debug_enabled: cmd += ['-g'] if closure_enabled: diff --git a/tools/link.py b/tools/link.py index 181adf252807a..542e7dbd052e4 100644 --- a/tools/link.py +++ b/tools/link.py @@ -163,6 +163,10 @@ def base64_encode(filename): return b64.decode('ascii') +def base64_or_binary_encode(b): + return binary_encode(b) if settings.SINGLE_FILE_BINARY_ENCODE else base64_encode(b) + + def align_to_wasm_page_boundary(address): page_size = webassembly.WASM_PAGE_SIZE return ((address + 
(page_size - 1)) // page_size) * page_size @@ -2435,7 +2439,7 @@ def phase_binaryen(target, options, wasm_target): if final_js and settings.SINGLE_FILE and not settings.WASM2JS: js = read_file(final_js) - js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_encode(wasm_target)) + js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_or_binary_encode(wasm_target)) delete_file(wasm_target) write_file(final_js, js) @@ -2945,9 +2949,44 @@ def move_file(src, dst): shutil.move(src, dst) +def binary_encode(filename): + """This function encodes the given binary byte array to a UTF-8 string, by + first adding +1 to all the bytes [0, 255] to form values [1, 256], and then + encoding each of those values as UTF-8, except for specific byte values that + are escaped as two bytes. This kind of encoding results in a string that will + compress well by both gzip and brotli, unlike base64 encoding binary data + would do, and avoids emitting the null byte inside a string. + """ + + data = utils.read_binary(filename) + + out = bytearray(len(data) * 2) # Size output buffer conservatively + i = 0 + for d in data: + d += 1 # Offset all bytes up by +1 to make zero (a very common value) be encoded with only one byte as 0x01. This is possible since we can encode 255 as 0x100 in UTF-8. + if d == ord('"'): + # Escape double quote " character with a backslash since we are writing the binary string inside double quotes. + # Also closure optimizer will turn the string into being delimited with double quotes, even if it were single quotes to start with. (" -> 2 bytes) + buf = [ord('\\'), d] + elif d == ord('\r'): + buf = [ord('\\'), ord('r')] # Escape carriage return 0x0D as \r -> 2 bytes + elif d == ord('\n'): + buf = [ord('\\'), ord('n')] # Escape newline 0x0A as \n -> 2 bytes + elif d == ord('\\'): + buf = [ord('\\'), ord('\\')] # Escape backslash \ as \\ -> 2 bytes + else: + buf = chr(d).encode('utf-8') # Otherwise write the original value encoded in UTF-8 (1 or 2 bytes). 
+ for b in buf: # Write the bytes to output buffer + out[i] = b + i += 1 + return out[0:i].decode('utf-8') # Crop output buffer to the actual used size + + # Returns the subresource location for run-time access def get_subresource_location(path, mimetype='application/octet-stream'): if settings.SINGLE_FILE: + if settings.SINGLE_FILE_BINARY_ENCODE: + return binary_encode(path) return f'data:{mimetype};base64,{base64_encode(path)}' else: return os.path.basename(path)