-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Embed wasm by encoding binary data as UTF-8 code points. #21478
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3b9e5e3
059699a
735af53
ed9b7e1
926fde4
85d6ea4
e199eb0
23007c1
451fc77
cdd1719
d4c8067
6435c68
426c9d2
ca63b7b
a27937f
7eb94f2
66db56c
46fa3b8
0fad312
bb8f0f5
f8d9641
0317531
3ac3aab
514759b
77badc9
ea729f0
1b163d2
09c5aa8
da757b9
9abe835
b7e9a2c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| // Prevent Closure from minifying the binaryDecode() function, or otherwise | ||
| // Closure may analyze through the WASM_BINARY_DATA placeholder string into this | ||
| // function, leading into incorrect results. | ||
| /** @noinline */ | ||
| function binaryDecode(bin) { | ||
| for(var i = 0, l = bin.length, o = new Uint8Array(l); i < l; ++i) { | ||
| o[i] = bin.charCodeAt(i) - 1; | ||
| } | ||
| return o; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -129,13 +129,18 @@ var SUPPORT_BASE64_EMBEDDING; | |
| var filename; | ||
| filename ||= '<<< filename >>>'; | ||
|
|
||
| #if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE | ||
| #include "binaryDecode.js" | ||
| var workerURL = URL.createObjectURL(new Blob([binaryDecode(filename)], {type: 'application/javascript'})); | ||
| #else | ||
| var workerURL = filename; | ||
| if (SUPPORT_BASE64_EMBEDDING) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated to this change but this line looks very strange. Shouldn't this be
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| var fileBytes = tryParseAsDataURI(filename); | ||
| if (fileBytes) { | ||
| workerURL = URL.createObjectURL(new Blob([fileBytes], {type: 'application/javascript'})); | ||
| } | ||
| } | ||
| #endif | ||
| var worker = new Worker(workerURL); | ||
|
|
||
| #if ENVIRONMENT_MAY_BE_NODE | ||
|
|
@@ -166,7 +171,11 @@ worker.onmessage = (event) => { | |
| if (!workerResponded) { | ||
| workerResponded = true; | ||
| Module.setStatus?.(''); | ||
| #if SINGLE_FILE && SINGLE_FILE_BINARY_ENCODE | ||
| URL.revokeObjectURL(workerURL); | ||
| #else | ||
| if (SUPPORT_BASE64_EMBEDDING && workerURL !== filename) URL.revokeObjectURL(workerURL); | ||
| #endif | ||
| } | ||
|
|
||
| var data = event.data; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1859,6 +1859,12 @@ var WASMFS = false; | |
| // [link] | ||
| var SINGLE_FILE = false; | ||
|
|
||
| // If true, binary Wasm content is encoded using a custom UTF-8 embedding | ||
| // instead of base64. This generates smaller binary. | ||
| // Set this to false to revert back to earlier base64 encoding if you run into | ||
| // issues with the binary encoding. (and please let us know of any such issues) | ||
| var SINGLE_FILE_BINARY_ENCODE = true; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is Is it because some folks can't handle UTF-8? If so, it might be worth mentioning there. e.g. "Disable this if you can't handle UTF-8 chars in the generated JS".
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea is that it would allow developers to pass Updated the text doc. |
||
|
|
||
| // If set to 1, all JS libraries will be automatically available at link time. | ||
| // This gets set to 0 in STRICT mode (or with MINIMAL_RUNTIME) which mean you | ||
| // need to explicitly specify -lfoo.js in at link time in order to access | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| { | ||
| "a.html": 17586, | ||
| "a.html.gz": 10152, | ||
| "total": 17586, | ||
| "total_gz": 10152 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| { | ||
| "a.html": 12690, | ||
| "a.html.gz": 6857, | ||
| "total": 12690, | ||
| "total_gz": 6857 | ||
| "a.html": 11058, | ||
| "a.html.gz": 5724, | ||
| "total": 11058, | ||
| "total_gz": 5724 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| { | ||
| "a.html": 17277, | ||
| "a.html.gz": 7489, | ||
| "a.html.gz": 7486, | ||
| "total": 17277, | ||
| "total_gz": 7489 | ||
| "total_gz": 7486 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| 50948 | ||
| 50942 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3270,10 +3270,12 @@ def test_modularize(self, opts): | |
| # this test is synchronous, so avoid async startup due to wasm features | ||
| self.compile_btest('browser_test_hello_world.c', ['-sMODULARIZE', '-sSINGLE_FILE'] + args + opts) | ||
| create_file('a.html', ''' | ||
| <!DOCTYPE html><html lang="en"><head><meta charset="utf-8"></head><body> | ||
| <script src="a.out.js"></script> | ||
| <script> | ||
| %s | ||
| </script> | ||
| </body></html> | ||
| ''' % code) | ||
| self.run_browser('a.html', '/report_result?0') | ||
|
|
||
|
|
@@ -4667,7 +4669,7 @@ def test_single_file_locate_file(self): | |
| # Tests that SINGLE_FILE works as intended in a Worker in JS output | ||
| def test_single_file_worker_js(self): | ||
| self.compile_btest('browser_test_hello_world.c', ['-o', 'test.js', '--proxy-to-worker', '-sSINGLE_FILE']) | ||
| create_file('test.html', '<script src="test.js"></script>') | ||
| create_file('test.html', '<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"></head><body><script src="test.js"></script></body></html>') | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this test break without this change? Does the charset in the Does the mean that some/all users of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Our existing shell files have already ~from the dawn of time had a So we can expect quite safely that all the custom shell files that users have monkeyed off of any of the existing shell files we provide, will be UTF-8 (unless they have explicitly removed the meta charset directive, which is a bad idea in general) About 98.2% of all websites utilize UTF-8, it is practically the only encoding used on the web. I don't really know of anyone who would use any other encoding on their web pages anymore. It is just that our tests scripts have omitted the meta charset='utf-8' directive. (If users did not want for some reason to specify that meta charset directive, they can also save the HTML file with a UTF-8 BOM marker, or they can send a
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be clear, without this change, this test is broken? I wonder if we should write a test to verify that I guess we don't have any other non-ascii strings in our generated code otherwise
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That's correct, the test would not work otherwise.
I am not sure what would be the benefit of such test? If we wanted to go the extra mile, I think we would want to either generate a UTF-8 BOM into the HTML file, or issue a link time warning if generated HTML file contains UTF-8 code points, but the HTML file does not contain a
That is true. We (Emscripten) don't, but if users would have any UTF-8 chars they contain in JS libraries, then that would require them to specify the charset.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This would a useful warning for users who generate HTML output but my understanding is that the vast majority of our users generate JavaScript and then write their own HTML separetely. I'm curious, does unity use emscripten to generate HTML? Is so I guess that would be big use case for HTML output in production. Otherwise I don't know of any. |
||
| self.run_browser('test.html', '/report_result?0') | ||
| self.assertExists('test.js') | ||
| self.assertNotExists('test.worker.js') | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -599,6 +599,8 @@ def closure_compiler(filename, advanced=True, extra_closure_args=None): | |
| args += ['--language_out', 'NO_TRANSPILE'] | ||
| # Tell closure never to inject the 'use strict' directive. | ||
| args += ['--emit_use_strict=false'] | ||
| # Always output UTF-8 files, this helps generate UTF-8 code points instead of escaping code points with \uxxxx inside strings. https://github.com/google/closure-compiler/issues/4158 | ||
| args += ['--charset=UTF8'] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like to see this land separately too if possible, along with a test that closure no longer produces
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This effect is already verified by the existing tests at https://github.com/emscripten-core/emscripten/pull/21478/files#diff-95c5e9a22c0a073731d32eb5c839692fd5851db19b919d30f14c2fde0c7c261bR2-R5, so we should be covered?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd still love to see this change (along with the test it effects) land separately, but I won't force you to do it.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh wait the change in file size you are referring to is due to I was wondering if this change would effect other tests.. In theory it should effect any JS code which includes a UTF-8 string, right? Regardless of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yeah, this would specify the encoding to be UTF-8 for all output when pushed through Closure. |
||
|
|
||
| if settings.IGNORE_CLOSURE_COMPILER_ERRORS: | ||
| args.append('--jscomp_off=*') | ||
|
|
@@ -649,7 +651,8 @@ def move_to_safe_7bit_ascii_filename(filename): | |
| # 7-bit ASCII range. Therefore make sure the command line we pass does not contain any such | ||
| # input files by passing all input filenames relative to the cwd. (user temp directory might | ||
| # be in user's home directory, and user's profile name might contain unicode characters) | ||
| proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir) | ||
| # https://github.com/google/closure-compiler/issues/4159: Closure outputs stdout/stderr in iso-8859-1 on Windows. | ||
| proc = run_process(cmd, stderr=PIPE, check=False, env=env, cwd=tempfiles.tmpdir, encoding='iso-8859-1' if WINDOWS else 'utf-8') | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this and existing bug, or does it only show up with
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an Closure compiler bug. It occurs both with and without I don't know what is the default charset used by Closure. |
||
|
|
||
| # XXX Closure bug: if Closure is invoked with --create_source_map, Closure should create a | ||
| # outfile.map source map file (https://github.com/google/closure-compiler/wiki/Source-Maps) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -149,6 +149,10 @@ def base64_encode(b): | |
| return b64.decode('ascii') | ||
|
|
||
|
|
||
| def base64_or_binary_encode(b): | ||
| return binary_encode(b) if settings.SINGLE_FILE and settings.SINGLE_FILE_BINARY_ENCODE else base64_encode(b) | ||
|
|
||
|
|
||
| def align_to_wasm_page_boundary(address): | ||
| page_size = webassembly.WASM_PAGE_SIZE | ||
| return ((address + (page_size - 1)) // page_size) * page_size | ||
|
|
@@ -2339,7 +2343,7 @@ def phase_binaryen(target, options, wasm_target): | |
| js = read_file(final_js) | ||
|
|
||
| if settings.MINIMAL_RUNTIME: | ||
| js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_encode(read_binary(wasm_target))) | ||
| js = do_replace(js, '<<< WASM_BINARY_DATA >>>', base64_or_binary_encode(read_binary(wasm_target))) | ||
| else: | ||
| js = do_replace(js, '<<< WASM_BINARY_FILE >>>', get_subresource_location(wasm_target)) | ||
| delete_file(wasm_target) | ||
|
|
@@ -2981,11 +2985,45 @@ def move_file(src, dst): | |
| shutil.move(src, dst) | ||
|
|
||
|
|
||
| def binary_encode(data): | ||
juj marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """This function encodes the given binary byte array to a UTF-8 string, by | ||
| first adding +1 to all the bytes [0, 255] to form values [1, 256], and then | ||
| encoding each of those values as UTF-8, except for specific byte values that | ||
| are escaped as two bytes. This kind of encoding results in a string that will | ||
| compress well by both gzip and brotli, unlike base64 encoding binary data | ||
| would do, and avoids emitting the null byte inside a string. | ||
| """ | ||
|
|
||
| out = bytearray(len(data) * 2) # Size output buffer conservatively | ||
| i = 0 | ||
| for d in data: | ||
| d += 1 # Offset all bytes up by +1 to make zero (a very common value) be encoded with only one byte as 0x01. This is possible since we can encode 255 as 0x100 in UTF-8. | ||
| if d == ord("'"): | ||
| buf = [ord('\\'), d] # Escape single quote ' character with a backspace since we are writing a string inside single quotes. (' -> 2 bytes) | ||
| elif d == ord('"'): | ||
| buf = [ord('\\'), d] # Escape double quote " character with a backspace since optimizer may turn the string into being delimited with double quotes. (" -> 2 bytes) | ||
| elif d == ord('\r'): | ||
| buf = [ord('\\'), ord('r')] # Escape carriage return 0x0D as \r -> 2 bytes | ||
| elif d == ord('\n'): | ||
| buf = [ord('\\'), ord('n')] # Escape newline 0x0A as \n -> 2 bytes | ||
| elif d == ord('\\'): | ||
| buf = [ord('\\'), ord('\\')] # Escape backslash \ as \\ -> 2 bytes | ||
| else: | ||
| buf = chr(d).encode('utf-8') # Otherwise write the original value encoded in UTF-8 (1 or 2 bytes). | ||
| for b in buf: # Write the bytes to output buffer | ||
| out[i] = b | ||
| i += 1 | ||
| return out[0:i].decode('utf-8') # Crop output buffer to the actual used size | ||
|
|
||
|
|
||
| # Returns the subresource location for run-time access | ||
| def get_subresource_location(path): | ||
| if settings.SINGLE_FILE: | ||
| data = base64.b64encode(utils.read_binary(path)) | ||
| return 'data:application/octet-stream;base64,' + data.decode('ascii') | ||
| if settings.SINGLE_FILE_BINARY_ENCODE: | ||
| return binary_encode(utils.read_binary(path)) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this not need some kind of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, these binary embedded strings are not passed to the browser to decode as Data URIs, so specifying a MIME type would be meaningless. |
||
| else: | ||
| data = base64.b64encode(utils.read_binary(path)) | ||
| return 'data:application/octet-stream;base64,' + data.decode('ascii') | ||
| else: | ||
| return os.path.basename(path) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about calling this
SUPPORT_UTF8_EMBEDDINGto match the existingSUPPORT_BASE64_EMBEDDING?I do wish we could make this change without adding yet another setting, but I guess its not reasonable to make it unconditional. Do you think we could make it unconditional at some point in the future?