emscripten-core · juj · Mar 5, 2024 · Mar 5, 2024 · Mar 6, 2024 · Mar 6, 2024
diff --git a/site/source/docs/tools_reference/settings_reference.rst b/site/source/docs/tools_reference/settings_reference.rst
@@ -2857,6 +2857,18 @@ then you can safely ignore this warning.
 
 Default value: false
 
+.. _single_file_binary_encode:
+
+SINGLE_FILE_BINARY_ENCODE
+=========================
+
+If true, binary Wasm content is encoded using a custom UTF-8 embedding
+instead of base64. This generates a smaller binary.
+Set this to false to revert to the earlier base64 encoding if you run into
+issues with the binary encoding (and please let us know of any such issues).
+
+Default value: true
+
 .. _auto_js_libraries:
 
 AUTO_JS_LIBRARIES

diff --git a/src/binaryDecode.js b/src/binaryDecode.js
@@ -0,0 +1,10 @@
+// Decodes a binary-encoded string to bytes: each char code holds (byte + 1).
+// Marked @noinline so that Closure does not inline this function and analyze
+// through the WASM_BINARY_DATA placeholder string, giving incorrect results.
+/** @noinline */
+function binaryDecode(bin) {
+  for(var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) { // one output byte per UTF-16 code unit of the input
+    o[i] = bin.charCodeAt(i) - 1; // undo the +1 offset, mapping char codes [1, 256] back to bytes [0, 255]
+  }
+  return o;
+}
diff --git a/src/preamble.js b/src/preamble.js
@@ -603,6 +603,10 @@ function instrumentWasmTableWithAbort() {
 }
 #endif
 
+#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS
+#include "binaryDecode.js"
+#endif
+
 function findWasmBinary() {
 #if EXPORT_ES6 && USE_ES6_IMPORT_META && !SINGLE_FILE && !AUDIO_WORKLET
   if (Module['locateFile']) {
@@ -613,7 +617,13 @@ function findWasmBinary() {
       return locateFile(f);
     }
 #endif
+
+#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS
+    return binaryDecode(f);
+#else
     return f;
+#endif
+
 #if EXPORT_ES6 && USE_ES6_IMPORT_META && !SINGLE_FILE && !AUDIO_WORKLET // In single-file mode, repeating WASM_BINARY_FILE would emit the contents again. For an Audio Worklet, we cannot use `new URL()`.
   }
 #if ENVIRONMENT_MAY_BE_SHELL
@@ -628,6 +638,9 @@ function findWasmBinary() {
 var wasmBinaryFile;
 
 function getBinarySync(file) {
+#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE
+  return file;
+#else
   if (file == wasmBinaryFile && wasmBinary) {
     return new Uint8Array(wasmBinary);
   }
@@ -645,6 +658,7 @@ function getBinarySync(file) {
 #else
   throw 'sync fetching of the wasm failed: you can preload it to Module["wasmBinary"] manually, or emcc.py will do that for you when generating HTML (but not JS)';
 #endif
+#endif
 }
 
 function getBinaryPromise(binaryFile) {

diff --git a/src/preamble_minimal.js b/src/preamble_minimal.js
@@ -45,10 +45,17 @@ if (Module['doWasm2JS']) {
 #endif
 
 #if SINGLE_FILE && WASM == 1 && !WASM2JS
+
+#if SINGLE_FILE_BINARY_ENCODE
+#include "binaryDecode.js"
+Module['wasm'] = binaryDecode('<<< WASM_BINARY_DATA >>>');
+#else
 #include "base64Decode.js"
 Module['wasm'] = base64Decode('<<< WASM_BINARY_DATA >>>');
 #endif
 
+#endif
+
 var HEAP8, HEAP16, HEAP32, HEAPU8, HEAPU16, HEAPU32, HEAPF32, HEAPF64,
 #if WASM_BIGINT
   HEAP64, HEAPU64,

diff --git a/src/proxyClient.js b/src/proxyClient.js
@@ -129,13 +129,18 @@ var SUPPORT_BASE64_EMBEDDING;
 var filename;
 filename ||= '<<< filename >>>';
 
+#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE
+#include "binaryDecode.js"
+var workerURL = URL.createObjectURL(new Blob([binaryDecode(filename)], {type: 'application/javascript'}));
+#else
 var workerURL = filename;
 if (SUPPORT_BASE64_EMBEDDING) {
   var fileBytes = tryParseAsDataURI(filename);
   if (fileBytes) {
     workerURL = URL.createObjectURL(new Blob([fileBytes], {type: 'application/javascript'}));
   }
 }
+#endif
 var worker = new Worker(workerURL);
 
 #if ENVIRONMENT_MAY_BE_NODE
@@ -166,7 +171,11 @@ worker.onmessage = (event) => {
   if (!workerResponded) {
     workerResponded = true;
     Module.setStatus?.('');
+#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE
+    URL.revokeObjectURL(workerURL);
+#else
     if (SUPPORT_BASE64_EMBEDDING && workerURL !== filename) URL.revokeObjectURL(workerURL);
+#endif
   }
 
   var data = event.data;

diff --git a/src/settings.js b/src/settings.js
@@ -1859,6 +1859,12 @@ var WASMFS = false;
 // [link]
 var SINGLE_FILE = false;
 
+// If true, binary Wasm content is encoded using a custom UTF-8 embedding
+// instead of base64. This generates a smaller binary.
+// Set this to false to revert to the earlier base64 encoding if you run into
+// issues with the binary encoding (and please let us know of any such issues).
+var SINGLE_FILE_BINARY_ENCODE = true;
+
 // If set to 1, all JS libraries will be automatically available at link time.
 // This gets set to 0 in STRICT mode (or with MINIMAL_RUNTIME) which mean you
 // need to explicitly specify -lfoo.js in at link time in order to access

diff --git a/test/code_size/embind_hello_wasm.json b/test/code_size/embind_hello_wasm.json
@@ -4,7 +4,7 @@
   "a.js": 9920,
   "a.js.gz": 4354,
   "a.wasm": 7715,
-  "a.wasm.gz": 3512,
+  "a.wasm.gz": 3508,
   "total": 18187,
-  "total_gz": 8246
+  "total_gz": 8242
 }
diff --git a/test/code_size/hello_webgl2_wasm_singlefile_wasm.json b/test/code_size/hello_webgl2_wasm_singlefile_wasm.json
@@ -0,0 +1,6 @@
+{
+  "a.html": 17586,
+  "a.html.gz": 10152,
+  "total": 17586,
+  "total_gz": 10152
+}
diff --git a/test/code_size/math_wasm.json b/test/code_size/math_wasm.json
@@ -4,7 +4,7 @@
   "a.js": 110,
   "a.js.gz": 125,
   "a.wasm": 2719,
-  "a.wasm.gz": 1674,
+  "a.wasm.gz": 1673,
   "total": 3381,
-  "total_gz": 2179
+  "total_gz": 2178
 }
diff --git a/test/code_size/random_printf_wasm.json b/test/code_size/random_printf_wasm.json
@@ -1,6 +1,6 @@
 {
-  "a.html": 12690,
-  "a.html.gz": 6857,
-  "total": 12690,
-  "total_gz": 6857
+  "a.html": 11058,
+  "a.html.gz": 5724,
+  "total": 11058,
+  "total_gz": 5724
 }
diff --git a/test/code_size/random_printf_wasm2js.json b/test/code_size/random_printf_wasm2js.json
@@ -1,6 +1,6 @@
 {
   "a.html": 17277,
-  "a.html.gz": 7489,
+  "a.html.gz": 7486,
   "total": 17277,
-  "total_gz": 7489
+  "total_gz": 7486
 }
diff --git a/test/other/codesize/test_codesize_files_wasmfs.size b/test/other/codesize/test_codesize_files_wasmfs.size
@@ -1 +1 @@
-50948
+50942
diff --git a/test/test_browser.py b/test/test_browser.py
@@ -3270,10 +3270,12 @@ def test_modularize(self, opts):
       # this test is synchronous, so avoid async startup due to wasm features
       self.compile_btest('browser_test_hello_world.c', ['-sMODULARIZE', '-sSINGLE_FILE'] + args + opts)
       create_file('a.html', '''
+        <!DOCTYPE html><html lang="en"><head><meta charset="utf-8"></head><body>
         <script src="a.out.js"></script>
         <script>
           %s
         </script>
+        </body></html>
       ''' % code)
       self.run_browser('a.html', '/report_result?0')
 
@@ -4667,7 +4669,7 @@ def test_single_file_locate_file(self):
   # Tests that SINGLE_FILE works as intended in a Worker in JS output
   def test_single_file_worker_js(self):
     self.compile_btest('browser_test_hello_world.c', ['-o', 'test.js', '--proxy-to-worker', '-sSINGLE_FILE'])
-    create_file('test.html', '<script src="test.js"></script>')
+    create_file('test.html', '<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"></head><body><script src="test.js"></script></body></html>')
     self.run_browser('test.html', '/report_result?0')
     self.assertExists('test.js')
     self.assertNotExists('test.worker.js')

diff --git a/test/test_other.py b/test/test_other.py
@@ -8536,7 +8536,7 @@ def test_unoptimized_code_size(self):
     # We don't care too about unoptimized code size but we would like to keep it
     # under control to a certain extent.  This test allows us to track major
     # changes to the size of the unoptimized and unminified code size.
-    # Run with `--rebase` when this test fails.
+    # Run with `--rebaseline` when this test fails.
     self.build(test_file('hello_world.c'), emcc_args=['-O0', '--output_eol=linux'])
     self.check_expected_size_in_file('wasm',
                                      test_file('other/test_unoptimized_code_size.wasm.size'),
@@ -9308,8 +9308,9 @@ def test_standalone_system_headers(self):
 
   @is_slow_test
   @parameterized({
-    '': (True,),
-    'disabled': (False,),
+    '': (1,),
+    'disabled': (0,),
+    'binary_encode': (2,),
   })
   @also_with_wasm2js
   def test_single_file(self, single_file_enabled):
@@ -9327,6 +9328,8 @@ def test_single_file(self, single_file_enabled):
       else:
         expect_wasm = self.is_wasm()
 
+      cmd += [f'-sSINGLE_FILE_BINARY_ENCODE={int(single_file_enabled == 2)}']
+
       if debug_enabled:
         cmd += ['-g']
       if closure_enabled:
@@ -10917,6 +10920,7 @@ def test_function_exports_are_small(self, args, opt, closure):
     'random_printf_wasm2js': ('random_printf', True),
     'hello_webgl_wasm': ('hello_webgl', False),
     'hello_webgl_wasm2js': ('hello_webgl', True),
+    'hello_webgl2_wasm_singlefile': ('hello_webgl2_wasm_singlefile', False),
     'hello_webgl2_wasm': ('hello_webgl2', False),
     'hello_webgl2_wasm2js': ('hello_webgl2', True),
     'math': ('math', False),
@@ -10963,6 +10967,7 @@ def test_minimal_runtime_code_size(self, test_name, js, compare_js_output=False)
                            '-lGL',
                            '-sMODULARIZE']
     hello_webgl2_sources = hello_webgl_sources + ['-sMAX_WEBGL_VERSION=2']
+    hello_webgl2_wasm_singlefile_sources = hello_webgl2_sources + ['-sSINGLE_FILE']
     hello_wasm_worker_sources = [test_file('wasm_worker/wasm_worker_code_size.c'), '-sWASM_WORKERS', '-sENVIRONMENT=web,worker']
     embind_hello_sources = [test_file('code_size/embind_hello_world.cpp'), '-lembind']
     embind_val_sources = [test_file('code_size/embind_val_hello_world.cpp'),
@@ -10977,6 +10982,7 @@ def test_minimal_runtime_code_size(self, test_name, js, compare_js_output=False)
       'hello_webgl': hello_webgl_sources,
       'math': math_sources,
       'hello_webgl2': hello_webgl2_sources,
+      'hello_webgl2_wasm_singlefile': hello_webgl2_wasm_singlefile_sources,
       'hello_wasm_worker': hello_wasm_worker_sources,
       'embind_val': embind_val_sources,
       'embind_hello': embind_hello_sources,

diff --git a/tools/building.py b/tools/building.py
@@ -599,6 +599,8 @@ def closure_compiler(filename, advanced=True, extra_closure_args=None):
   args += ['--language_out', 'NO_TRANSPILE']
   # Tell closure never to inject the 'use strict' directive.
   args += ['--emit_use_strict=false']
+  # Always output UTF-8 files. This makes Closure emit UTF-8 code points instead of escaping them as \uxxxx inside strings. See https://github.com/google/closure-compiler/issues/4158
+  args += ['--charset=UTF8']
 
   if settings.IGNORE_CLOSURE_COMPILER_ERRORS:
     args.append('--jscomp_off=*')
@@ -649,7 +651,8 @@ def move_to_safe_7bit_ascii_filename(filename):
   # 7-bit ASCII range. Therefore make sure the command line we pass does not contain any such
   # input files by passing all input filenames relative to the cwd. (user temp directory might
   # be in user's home directory, and user's profile name might contain unicode characters)
-  proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir)
+  # https://github.com/google/closure-compiler/issues/4159: Closure outputs stdout/stderr in iso-8859-1 on Windows.
+  proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir, encoding='iso-8859-1' if WINDOWS else 'utf-8')
 
   # XXX Closure bug: if Closure is invoked with --create_source_map, Closure should create a
   # outfile.map source map file (https://github.com/google/closure-compiler/wiki/Source-Maps)

diff --git a/tools/link.py b/tools/link.py
@@ -149,6 +149,10 @@ def base64_encode(b):
   return b64.decode('ascii')
 
 
+def base64_or_binary_encode(b): # Encode bytes `b` for embedding: binary_encode() when both SINGLE_FILE and SINGLE_FILE_BINARY_ENCODE are set, otherwise base64_encode().
+  return binary_encode(b) if settings.SINGLE_FILE and settings.SINGLE_FILE_BINARY_ENCODE else base64_encode(b)
+
+
 def align_to_wasm_page_boundary(address):
   page_size = webassembly.WASM_PAGE_SIZE
   return ((address + (page_size - 1)) // page_size) * page_size
@@ -2339,7 +2343,7 @@ def phase_binaryen(target, options, wasm_target):
     js = read_file(final_js)
 
     if settings.MINIMAL_RUNTIME:
-      js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_encode(read_binary(wasm_target)))
+      js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_or_binary_encode(read_binary(wasm_target)))
     else:
       js = do_replace(js, '<<< WASM_BINARY_FILE >>>', get_subresource_location(wasm_target))
     delete_file(wasm_target)
@@ -2981,11 +2985,45 @@ def move_file(src, dst):
   shutil.move(src, dst)
 
 
+def binary_encode(data):
+  """This function encodes the given binary byte array to a UTF-8 string, by
+  first adding +1 to all the bytes [0, 255] to form values [1, 256], and then
+  encoding each of those values as UTF-8, except for specific byte values that
+  are escaped as two bytes. This kind of encoding results in a string that will
+  compress well by both gzip and brotli, unlike base64 encoding binary data
+  would do, and avoids emitting the null byte inside a string.
+  """
+
+  out = bytearray(len(data) * 2) # Size output buffer conservatively: every branch below emits at most 2 bytes per input byte
+  i = 0
+  for d in data:
+    d += 1 # Offset all bytes up by +1 so that zero (a very common value) is encoded with only one byte, 0x01. This is possible since 255 + 1 = 256 (U+0100) is still encodable in UTF-8.
+    if d == ord("'"):
+      buf = [ord('\\'), d] # Escape single quote ' character with a backslash since we are writing a string inside single quotes. (' -> 2 bytes)
+    elif d == ord('"'):
+      buf = [ord('\\'), d] # Escape double quote " character with a backslash since optimizer may turn the string into being delimited with double quotes. (" -> 2 bytes)
+    elif d == ord('\r'):
+      buf = [ord('\\'), ord('r')] # Escape carriage return 0x0D as the two characters \r -> 2 bytes
+    elif d == ord('\n'):
+      buf = [ord('\\'), ord('n')] # Escape newline 0x0A as the two characters \n -> 2 bytes
+    elif d == ord('\\'):
+      buf = [ord('\\'), ord('\\')] # Escape backslash \ as \\ -> 2 bytes
+    else:
+      buf = chr(d).encode('utf-8') # Otherwise write the offset value encoded in UTF-8 (1 or 2 bytes).
+    for b in buf: # Append the encoded bytes to the output buffer
+      out[i] = b
+      i += 1
+  return out[0:i].decode('utf-8') # Crop output buffer to the actually used size and return it as a str
+
+
 # Returns the subresource location for run-time access
 def get_subresource_location(path):
   if settings.SINGLE_FILE:
-    data = base64.b64encode(utils.read_binary(path))
-    return 'data:application/octet-stream;base64,' + data.decode('ascii')
+    if settings.SINGLE_FILE_BINARY_ENCODE:
+      return binary_encode(utils.read_binary(path))
+    else:
+      data = base64.b64encode(utils.read_binary(path))
+      return 'data:application/octet-stream;base64,' + data.decode('ascii')
   else:
     return os.path.basename(path)