diff --git a/site/source/docs/tools_reference/settings_reference.rst b/site/source/docs/tools_reference/settings_reference.rst index e71191a2961cf..bdf4296c53948 100644 --- a/site/source/docs/tools_reference/settings_reference.rst +++ b/site/source/docs/tools_reference/settings_reference.rst @@ -2857,6 +2857,18 @@ then you can safely ignore this warning. Default value: false +.. _single_file_binary_encode: + +SINGLE_FILE_BINARY_ENCODE +========================= + +If true, binary Wasm content is encoded using a custom UTF-8 embedding +instead of base64. This generates smaller binary. +Set this to false to revert back to earlier base64 encoding if you run into +issues with the binary encoding. (and please let us know of any such issues) + +Default value: true + .. _auto_js_libraries: AUTO_JS_LIBRARIES diff --git a/src/binaryDecode.js b/src/binaryDecode.js new file mode 100644 index 0000000000000..a35a737781998 --- /dev/null +++ b/src/binaryDecode.js @@ -0,0 +1,10 @@ +// Prevent Closure from inlining the binaryDecode() function, or otherwise +// Closure may analyze through the WASM_BINARY_DATA placeholder string into this +// function, leading to incorrect results. 
+/** @noinline */ +function binaryDecode(bin) { + for(var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) { + o[i] = bin.charCodeAt(i) - 1; + } + return o; +} diff --git a/src/preamble.js b/src/preamble.js index dd1429ed52f19..62debfd6935e0 100644 --- a/src/preamble.js +++ b/src/preamble.js @@ -603,6 +603,10 @@ function instrumentWasmTableWithAbort() { } #endif +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS +#include "binaryDecode.js" +#endif + function findWasmBinary() { #if EXPORT_ES6 && USE_ES6_IMPORT_META && !SINGLE_FILE && !AUDIO_WORKLET if (Module['locateFile']) { @@ -613,7 +617,13 @@ function findWasmBinary() { return locateFile(f); } #endif + +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS + return binaryDecode(f); +#else return f; +#endif + #if EXPORT_ES6 && USE_ES6_IMPORT_META && !SINGLE_FILE && !AUDIO_WORKLET // In single-file mode, repeating WASM_BINARY_FILE would emit the contents again. For an Audio Worklet, we cannot use `new URL()`. } #if ENVIRONMENT_MAY_BE_SHELL @@ -628,6 +638,9 @@ function findWasmBinary() { var wasmBinaryFile; function getBinarySync(file) { +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE + return file; +#else if (file == wasmBinaryFile && wasmBinary) { return new Uint8Array(wasmBinary); } @@ -645,6 +658,7 @@ function getBinarySync(file) { #else throw 'sync fetching of the wasm failed: you can preload it to Module["wasmBinary"] manually, or emcc.py will do that for you when generating HTML (but not JS)'; #endif +#endif } function getBinaryPromise(binaryFile) { diff --git a/src/preamble_minimal.js b/src/preamble_minimal.js index f110870d2b77e..a48696fca857e 100644 --- a/src/preamble_minimal.js +++ b/src/preamble_minimal.js @@ -45,10 +45,17 @@ if (Module['doWasm2JS']) { #endif #if SINGLE_FILE && WASM == 1 && !WASM2JS + +#if SINGLE_FILE_BINARY_ENCODE +#include "binaryDecode.js" +Module['wasm'] = binaryDecode('<<< WASM_BINARY_DATA >>>'); +#else #include "base64Decode.js" Module['wasm'] = base64Decode('<<< 
WASM_BINARY_DATA >>>'); #endif +#endif + var HEAP8, HEAP16, HEAP32, HEAPU8, HEAPU16, HEAPU32, HEAPF32, HEAPF64, #if WASM_BIGINT HEAP64, HEAPU64, diff --git a/src/proxyClient.js b/src/proxyClient.js index 95dd1d094d633..c8272f55a2cdc 100644 --- a/src/proxyClient.js +++ b/src/proxyClient.js @@ -129,6 +129,10 @@ var SUPPORT_BASE64_EMBEDDING; var filename; filename ||= '<<< filename >>>'; +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE +#include "binaryDecode.js" +var workerURL = URL.createObjectURL(new Blob([binaryDecode(filename)], {type: 'application/javascript'})); +#else var workerURL = filename; if (SUPPORT_BASE64_EMBEDDING) { var fileBytes = tryParseAsDataURI(filename); @@ -136,6 +140,7 @@ if (SUPPORT_BASE64_EMBEDDING) { workerURL = URL.createObjectURL(new Blob([fileBytes], {type: 'application/javascript'})); } } +#endif var worker = new Worker(workerURL); #if ENVIRONMENT_MAY_BE_NODE @@ -166,7 +171,11 @@ worker.onmessage = (event) => { if (!workerResponded) { workerResponded = true; Module.setStatus?.(''); +#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE + URL.revokeObjectURL(workerURL); +#else if (SUPPORT_BASE64_EMBEDDING && workerURL !== filename) URL.revokeObjectURL(workerURL); +#endif } var data = event.data; diff --git a/src/settings.js b/src/settings.js index 1c69c2ff0272e..5cba724bbf687 100644 --- a/src/settings.js +++ b/src/settings.js @@ -1859,6 +1859,12 @@ var WASMFS = false; // [link] var SINGLE_FILE = false; +// If true, binary Wasm content is encoded using a custom UTF-8 embedding +// instead of base64. This generates smaller binary. +// Set this to false to revert back to earlier base64 encoding if you run into +// issues with the binary encoding. (and please let us know of any such issues) +var SINGLE_FILE_BINARY_ENCODE = true; + // If set to 1, all JS libraries will be automatically available at link time. 
// This gets set to 0 in STRICT mode (or with MINIMAL_RUNTIME) which mean you // need to explicitly specify -lfoo.js in at link time in order to access diff --git a/test/code_size/embind_hello_wasm.json b/test/code_size/embind_hello_wasm.json index c64f5a38f2551..c2e009eb3b271 100644 --- a/test/code_size/embind_hello_wasm.json +++ b/test/code_size/embind_hello_wasm.json @@ -4,7 +4,7 @@ "a.js": 9920, "a.js.gz": 4354, "a.wasm": 7715, - "a.wasm.gz": 3512, + "a.wasm.gz": 3508, "total": 18187, - "total_gz": 8246 + "total_gz": 8242 } diff --git a/test/code_size/hello_webgl2_wasm_singlefile_wasm.json b/test/code_size/hello_webgl2_wasm_singlefile_wasm.json new file mode 100644 index 0000000000000..48a266fa0fe69 --- /dev/null +++ b/test/code_size/hello_webgl2_wasm_singlefile_wasm.json @@ -0,0 +1,6 @@ +{ + "a.html": 17586, + "a.html.gz": 10152, + "total": 17586, + "total_gz": 10152 +} diff --git a/test/code_size/math_wasm.json b/test/code_size/math_wasm.json index cefd1f1d7477e..5328e0087d307 100644 --- a/test/code_size/math_wasm.json +++ b/test/code_size/math_wasm.json @@ -4,7 +4,7 @@ "a.js": 110, "a.js.gz": 125, "a.wasm": 2719, - "a.wasm.gz": 1674, + "a.wasm.gz": 1673, "total": 3381, - "total_gz": 2179 + "total_gz": 2178 } diff --git a/test/code_size/random_printf_wasm.json b/test/code_size/random_printf_wasm.json index bcfa0afce45db..92a8465d26cda 100644 --- a/test/code_size/random_printf_wasm.json +++ b/test/code_size/random_printf_wasm.json @@ -1,6 +1,6 @@ { - "a.html": 12690, - "a.html.gz": 6857, - "total": 12690, - "total_gz": 6857 + "a.html": 11058, + "a.html.gz": 5724, + "total": 11058, + "total_gz": 5724 } diff --git a/test/code_size/random_printf_wasm2js.json b/test/code_size/random_printf_wasm2js.json index 1074f7b62fb46..09b30713f23f0 100644 --- a/test/code_size/random_printf_wasm2js.json +++ b/test/code_size/random_printf_wasm2js.json @@ -1,6 +1,6 @@ { "a.html": 17277, - "a.html.gz": 7489, + "a.html.gz": 7486, "total": 17277, - "total_gz": 7489 + "total_gz": 
7486 } diff --git a/test/other/codesize/test_codesize_files_wasmfs.size b/test/other/codesize/test_codesize_files_wasmfs.size index 30ae7b0fa53cb..c0c6e320481bb 100644 --- a/test/other/codesize/test_codesize_files_wasmfs.size +++ b/test/other/codesize/test_codesize_files_wasmfs.size @@ -1 +1 @@ -50948 +50942 diff --git a/test/test_browser.py b/test/test_browser.py index 4b79abbab5093..145d42ea2c368 100644 --- a/test/test_browser.py +++ b/test/test_browser.py @@ -3270,10 +3270,12 @@ def test_modularize(self, opts): # this test is synchronous, so avoid async startup due to wasm features self.compile_btest('browser_test_hello_world.c', ['-sMODULARIZE', '-sSINGLE_FILE'] + args + opts) create_file('a.html', ''' + + ''' % code) self.run_browser('a.html', '/report_result?0') @@ -4667,7 +4669,7 @@ def test_single_file_locate_file(self): # Tests that SINGLE_FILE works as intended in a Worker in JS output def test_single_file_worker_js(self): self.compile_btest('browser_test_hello_world.c', ['-o', 'test.js', '--proxy-to-worker', '-sSINGLE_FILE']) - create_file('test.html', '') + create_file('test.html', '') self.run_browser('test.html', '/report_result?0') self.assertExists('test.js') self.assertNotExists('test.worker.js') diff --git a/test/test_other.py b/test/test_other.py index 7cb6d54d3eace..57eda049e33a2 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -8536,7 +8536,7 @@ def test_unoptimized_code_size(self): # We don't care too about unoptimized code size but we would like to keep it # under control to a certain extent. This test allows us to track major # changes to the size of the unoptimized and unminified code size. - # Run with `--rebase` when this test fails. + # Run with `--rebaseline` when this test fails. 
self.build(test_file('hello_world.c'), emcc_args=['-O0', '--output_eol=linux']) self.check_expected_size_in_file('wasm', test_file('other/test_unoptimized_code_size.wasm.size'), @@ -9308,8 +9308,9 @@ def test_standalone_system_headers(self): @is_slow_test @parameterized({ - '': (True,), - 'disabled': (False,), + '': (1,), + 'disabled': (0,), + 'binary_encode': (2,), }) @also_with_wasm2js def test_single_file(self, single_file_enabled): @@ -9327,6 +9328,8 @@ def test_single_file(self, single_file_enabled): else: expect_wasm = self.is_wasm() + cmd += [f'-sSINGLE_FILE_BINARY_ENCODE={int(single_file_enabled == 2)}'] + if debug_enabled: cmd += ['-g'] if closure_enabled: @@ -10917,6 +10920,7 @@ def test_function_exports_are_small(self, args, opt, closure): 'random_printf_wasm2js': ('random_printf', True), 'hello_webgl_wasm': ('hello_webgl', False), 'hello_webgl_wasm2js': ('hello_webgl', True), + 'hello_webgl2_wasm_singlefile': ('hello_webgl2_wasm_singlefile', False), 'hello_webgl2_wasm': ('hello_webgl2', False), 'hello_webgl2_wasm2js': ('hello_webgl2', True), 'math': ('math', False), @@ -10963,6 +10967,7 @@ def test_minimal_runtime_code_size(self, test_name, js, compare_js_output=False) '-lGL', '-sMODULARIZE'] hello_webgl2_sources = hello_webgl_sources + ['-sMAX_WEBGL_VERSION=2'] + hello_webgl2_wasm_singlefile_sources = hello_webgl2_sources + ['-sSINGLE_FILE'] hello_wasm_worker_sources = [test_file('wasm_worker/wasm_worker_code_size.c'), '-sWASM_WORKERS', '-sENVIRONMENT=web,worker'] embind_hello_sources = [test_file('code_size/embind_hello_world.cpp'), '-lembind'] embind_val_sources = [test_file('code_size/embind_val_hello_world.cpp'), @@ -10977,6 +10982,7 @@ def test_minimal_runtime_code_size(self, test_name, js, compare_js_output=False) 'hello_webgl': hello_webgl_sources, 'math': math_sources, 'hello_webgl2': hello_webgl2_sources, + 'hello_webgl2_wasm_singlefile': hello_webgl2_wasm_singlefile_sources, 'hello_wasm_worker': hello_wasm_worker_sources, 'embind_val': 
embind_val_sources, 'embind_hello': embind_hello_sources, diff --git a/tools/building.py b/tools/building.py index 87ff61b23bfc6..eb3302a81dc0d 100644 --- a/tools/building.py +++ b/tools/building.py @@ -599,6 +599,8 @@ def closure_compiler(filename, advanced=True, extra_closure_args=None): args += ['--language_out', 'NO_TRANSPILE'] # Tell closure never to inject the 'use strict' directive. args += ['--emit_use_strict=false'] + # Always output UTF-8 files, this helps generate UTF-8 code points instead of escaping code points with \uxxxx inside strings. https://github.com/google/closure-compiler/issues/4158 + args += ['--charset=UTF8'] if settings.IGNORE_CLOSURE_COMPILER_ERRORS: args.append('--jscomp_off=*') @@ -649,7 +651,8 @@ def move_to_safe_7bit_ascii_filename(filename): # 7-bit ASCII range. Therefore make sure the command line we pass does not contain any such # input files by passing all input filenames relative to the cwd. (user temp directory might # be in user's home directory, and user's profile name might contain unicode characters) - proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir) + # https://github.com/google/closure-compiler/issues/4159: Closure outputs stdout/stderr in iso-8859-1 on Windows. 
+ proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir, encoding='iso-8859-1' if WINDOWS else 'utf-8') # XXX Closure bug: if Closure is invoked with --create_source_map, Closure should create a # outfile.map source map file (https://github.com/google/closure-compiler/wiki/Source-Maps) diff --git a/tools/link.py b/tools/link.py index c2c8db0c58b3d..8f7fa664ba105 100644 --- a/tools/link.py +++ b/tools/link.py @@ -149,6 +149,10 @@ def base64_encode(b): return b64.decode('ascii') +def base64_or_binary_encode(b): + return binary_encode(b) if settings.SINGLE_FILE and settings.SINGLE_FILE_BINARY_ENCODE else base64_encode(b) + + def align_to_wasm_page_boundary(address): page_size = webassembly.WASM_PAGE_SIZE return ((address + (page_size - 1)) // page_size) * page_size @@ -2339,7 +2343,7 @@ def phase_binaryen(target, options, wasm_target): js = read_file(final_js) if settings.MINIMAL_RUNTIME: - js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_encode(read_binary(wasm_target))) + js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_or_binary_encode(read_binary(wasm_target))) else: js = do_replace(js, '<<< WASM_BINARY_FILE >>>', get_subresource_location(wasm_target)) delete_file(wasm_target) @@ -2981,11 +2985,45 @@ def move_file(src, dst): shutil.move(src, dst) +def binary_encode(data): + """This function encodes the given binary byte array to a UTF-8 string, by + first adding +1 to all the bytes [0, 255] to form values [1, 256], and then + encoding each of those values as UTF-8, except for specific byte values that + are escaped as two bytes. This kind of encoding results in a string that will + compress well by both gzip and brotli, unlike base64 encoding binary data + would do, and avoids emitting the null byte inside a string. + """ + + out = bytearray(len(data) * 2) # Size output buffer conservatively + i = 0 + for d in data: + d += 1 # Offset all bytes up by +1 to make zero (a very common value) be encoded with only one byte as 0x01. 
This is possible since we can encode 255 as 0x100 in UTF-8. + if d == ord("'"): + buf = [ord('\\'), d] # Escape single quote ' character with a backslash since we are writing a string inside single quotes. (' -> 2 bytes) + elif d == ord('"'): + buf = [ord('\\'), d] # Escape double quote " character with a backslash since optimizer may turn the string into being delimited with double quotes. (" -> 2 bytes) + elif d == ord('\r'): + buf = [ord('\\'), ord('r')] # Escape carriage return 0x0D as \r -> 2 bytes + elif d == ord('\n'): + buf = [ord('\\'), ord('n')] # Escape newline 0x0A as \n -> 2 bytes + elif d == ord('\\'): + buf = [ord('\\'), ord('\\')] # Escape backslash \ as \\ -> 2 bytes + else: + buf = chr(d).encode('utf-8') # Otherwise write the original value encoded in UTF-8 (1 or 2 bytes). + for b in buf: # Write the bytes to output buffer + out[i] = b + i += 1 + return out[0:i].decode('utf-8') # Crop output buffer to the actual used size + + # Returns the subresource location for run-time access def get_subresource_location(path): if settings.SINGLE_FILE: - data = base64.b64encode(utils.read_binary(path)) - return 'data:application/octet-stream;base64,' + data.decode('ascii') + if settings.SINGLE_FILE_BINARY_ENCODE: + return binary_encode(utils.read_binary(path)) + else: + data = base64.b64encode(utils.read_binary(path)) + return 'data:application/octet-stream;base64,' + data.decode('ascii') else: return os.path.basename(path)