Skip to content

Commit 511a59e

Browse files
committed
Implement the -s SINGLE_FILE_BINARY_ENCODE=1 option to embed the Wasm binary in a binary-encoded form instead of base64 form in SINGLE_FILE mode. Continuation of emscripten-core#21478.
1 parent 6ca3700 commit 511a59e

File tree

5 files changed

+76
-11
lines changed

5 files changed

+76
-11
lines changed

src/preamble.js

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,10 @@ function instrumentWasmTableWithAbort() {
410410
}
411411
#endif
412412

413+
#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS
414+
#include "binaryDecode.js"
415+
#endif
416+
413417
#if !SOURCE_PHASE_IMPORTS && !WASM_ESM_INTEGRATION
414418
var wasmBinaryFile;
415419

@@ -423,27 +427,35 @@ function getWasmBinary(file) {}
423427
#else
424428

425429
function findWasmBinary() {
426-
#if SINGLE_FILE
430+
#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE && !WASM2JS
431+
return binaryDecode('<<< WASM_BINARY_DATA >>>');
432+
#elif SINGLE_FILE
427433
return base64Decode('<<< WASM_BINARY_DATA >>>');
434+
#elif AUDIO_WORKLET || !EXPORT_ES6
435+
// For an Audio Worklet, we cannot use `new URL()`.
436+
return locateFile('{{{ WASM_BINARY_FILE }}}');
428437
#else
429-
#if EXPORT_ES6 && !AUDIO_WORKLET
430-
if (Module['locateFile']) {
431-
#endif
432-
return locateFile('{{{ WASM_BINARY_FILE }}}');
433-
#if EXPORT_ES6 && !AUDIO_WORKLET // For an Audio Worklet, we cannot use `new URL()`.
434-
}
438+
435439
#if ENVIRONMENT_MAY_BE_SHELL
436440
if (ENVIRONMENT_IS_SHELL) {
437441
return '{{{ WASM_BINARY_FILE }}}';
438442
}
439443
#endif
444+
445+
if (Module['locateFile']) {
446+
return locateFile('{{{ WASM_BINARY_FILE }}}');
447+
}
448+
440449
// Use bundler-friendly `new URL(..., import.meta.url)` pattern; works in browsers too.
441450
return new URL('{{{ WASM_BINARY_FILE }}}', import.meta.url).href;
442-
#endif
451+
443452
#endif
444453
}
445454

446455
function getBinarySync(file) {
456+
#if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE
457+
return file;
458+
#else
447459
#if SINGLE_FILE
448460
if (ArrayBuffer.isView(file)) {
449461
return file;
@@ -464,6 +476,7 @@ function getBinarySync(file) {
464476
#else
465477
throw 'sync fetching of the wasm failed: you can preload it to Module["wasmBinary"] manually, or emcc.py will do that for you when generating HTML (but not JS)';
466478
#endif
479+
#endif
467480
}
468481

469482
async function getWasmBinary(binaryFile) {

src/settings.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1853,6 +1853,13 @@ var WASMFS = false;
18531853
// [link]
18541854
var SINGLE_FILE = false;
18551855

1856+
// If true, binary Wasm content embedded in SINGLE_FILE mode is encoded using a custom UTF-8 embedding
1857+
// instead of base64. This generates a smaller binary that compresses well.
1858+
// Set this to false to revert back to earlier base64 encoding if you run into
1859+
// issues with the binary encoding (and please let us know of any such issues).
1860+
// [link]
1861+
var SINGLE_FILE_BINARY_ENCODE = true;
1862+
18561863
// If set to 1, all JS libraries will be automatically available at link time.
18571864
// This gets set to 0 in STRICT mode (or with MINIMAL_RUNTIME) which mean you
18581865
// need to explicitly specify -lfoo.js in at link time in order to access

test/test_codesize.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class codesize(RunnerCore):
4343
'random_printf_wasm2js': ('random_printf', True),
4444
'hello_webgl_wasm': ('hello_webgl', False),
4545
'hello_webgl_wasm2js': ('hello_webgl', True),
46+
'hello_webgl2_wasm_singlefile': ('hello_webgl2_wasm_singlefile', False),
4647
'hello_webgl2_wasm': ('hello_webgl2', False),
4748
'hello_webgl2_wasm2js': ('hello_webgl2', True),
4849
'math': ('math', False),
@@ -87,6 +88,7 @@ def test_minimal_runtime_code_size(self, test_name, wasm2js, compare_js_output=F
8788
'-lGL',
8889
'-sMODULARIZE']
8990
hello_webgl2_sources = hello_webgl_sources + ['-sMAX_WEBGL_VERSION=2']
91+
hello_webgl2_wasm_singlefile_sources = hello_webgl2_sources + ['-sSINGLE_FILE']
9092
hello_wasm_worker_sources = [test_file('wasm_worker/wasm_worker_code_size.c'), '-sWASM_WORKERS', '-sENVIRONMENT=web']
9193
audio_worklet_sources = [test_file('webaudio/audioworklet.c'), '-sWASM_WORKERS', '-sAUDIO_WORKLET', '-sENVIRONMENT=web', '-sTEXTDECODER=1']
9294
embind_hello_sources = [test_file('codesize/embind_hello_world.cpp'), '-lembind']
@@ -98,6 +100,7 @@ def test_minimal_runtime_code_size(self, test_name, wasm2js, compare_js_output=F
98100
'hello_webgl': hello_webgl_sources,
99101
'math': math_sources,
100102
'hello_webgl2': hello_webgl2_sources,
103+
'hello_webgl2_wasm_singlefile': hello_webgl2_wasm_singlefile_sources,
101104
'hello_wasm_worker': hello_wasm_worker_sources,
102105
'audio_worklet': audio_worklet_sources,
103106
'embind_val': embind_val_sources,

test/test_other.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9475,8 +9475,9 @@ def test_standalone_system_headers(self, prefix):
94759475
'closure': (False, True),
94769476
})
94779477
@parameterized({
9478-
'': (True,),
9479-
'disabled': (False,),
9478+
'': (1,),
9479+
'disabled': (0,),
9480+
'binary_encode': (2,),
94809481
})
94819482
def test_single_file(self, debug_enabled, closure_enabled, single_file_enabled):
94829483
cmd = [EMCC, test_file('hello_world.c')] + self.get_cflags()
@@ -9487,6 +9488,8 @@ def test_single_file(self, debug_enabled, closure_enabled, single_file_enabled):
94879488
else:
94889489
expect_wasm = self.is_wasm()
94899490

9491+
cmd += [f'-sSINGLE_FILE_BINARY_ENCODE={int(single_file_enabled == 2)}']
9492+
94909493
if debug_enabled:
94919494
cmd += ['-g']
94929495
if closure_enabled:

tools/link.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ def base64_encode(filename):
163163
return b64.decode('ascii')
164164

165165

166+
def base64_or_binary_encode(b):
  """Encode the file named by `b` for embedding into the generated JS.

  Uses binary_encode() when both SINGLE_FILE and SINGLE_FILE_BINARY_ENCODE
  are set, and base64_encode() otherwise. Both encoders receive `b` unchanged.
  """
  use_binary = settings.SINGLE_FILE and settings.SINGLE_FILE_BINARY_ENCODE
  if use_binary:
    return binary_encode(b)
  return base64_encode(b)
168+
169+
166170
def align_to_wasm_page_boundary(address):
167171
page_size = webassembly.WASM_PAGE_SIZE
168172
return ((address + (page_size - 1)) // page_size) * page_size
@@ -2435,7 +2439,7 @@ def phase_binaryen(target, options, wasm_target):
24352439
if final_js and settings.SINGLE_FILE and not settings.WASM2JS:
24362440
js = read_file(final_js)
24372441

2438-
js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_encode(wasm_target))
2442+
js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_or_binary_encode(wasm_target))
24392443
delete_file(wasm_target)
24402444
write_file(final_js, js)
24412445

@@ -2945,9 +2949,44 @@ def move_file(src, dst):
29452949
shutil.move(src, dst)
29462950

29472951

2952+
def binary_encode(filename):
  """Read the file `filename` and encode its bytes as a string that can be
  embedded inside a quoted JS string literal.

  Every byte value in [0, 255] is first offset by +1 to give a code point in
  [1, 256], which is then emitted as that character, except for a handful of
  code points that are emitted as two-character backslash escapes (see the
  table below). The original docstring states that this kind of encoding
  compresses well under both gzip and brotli, unlike base64; the +1 offset
  also means the null character is never emitted inside the string.

  Returns a `str`. When that string is written out as UTF-8, each input byte
  occupies one or two bytes of output.
  """
  data = utils.read_binary(filename)

  # Shifted code points that must not appear verbatim inside the string
  # literal, mapped to their two-character escaped form.
  escapes = {
    # Single quote: the data is written inside a single-quoted string.
    ord("'"): "\\'",
    # Double quote: the optimizer may re-delimit the string with double quotes.
    ord('"'): '\\"',
    # Carriage return 0x0D is written as \r.
    ord('\r'): '\\r',
    # Newline 0x0A is written as \n.
    ord('\n'): '\\n',
    # A backslash is written as \\.
    ord('\\'): '\\\\',
  }

  # Translation table keyed by the *original* byte value. Offsetting by +1
  # lets zero (a very common value) be stored as the single character 0x01;
  # 255 becomes U+0100, which is still a valid code point.
  table = {b: escapes.get(b + 1, chr(b + 1)) for b in range(256)}

  # latin-1 maps each byte to the code point of the same value, so a single
  # translate() pass applies the table to every byte of the input.
  return data.decode('latin-1').translate(table)
2983+
2984+
29482985
# Returns the subresource location for run-time access
def get_subresource_location(path, mimetype='application/octet-stream'):
  """Return the string through which the file at `path` is accessed at run time.

  In SINGLE_FILE mode the file contents are embedded: as the output of
  binary_encode() when SINGLE_FILE_BINARY_ENCODE is set, otherwise as a base64
  `data:` URI that carries `mimetype`. Outside SINGLE_FILE mode only the
  basename of `path` is returned.
  """
  if not settings.SINGLE_FILE:
    return os.path.basename(path)
  if settings.SINGLE_FILE_BINARY_ENCODE:
    # binary_encode() takes a filename and reads the file itself, so it must
    # be given the path, not the already-read file contents.
    return binary_encode(path)
  return f'data:{mimetype};base64,{base64_encode(path)}'

0 commit comments

Comments
 (0)