-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New unicode-graphemes addon. #4519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2f7fe11
67e9689
41760df
8e730cd
9c34bb4
466501c
284cde8
68abf10
ebfa201
3003185
e87fa16
1a8c7da
fa89e17
cf403bc
693f6b2
dc6818d
5568233
7baff4e
08a0914
6b24593
4204215
dccfc11
a0dc881
88bfc6b
988b3b1
7202979
f103540
b9abbc0
4fbc623
302da18
b9a4760
3bfe5d8
9b21786
fb2c680
aac5d47
cf243ad
5994e25
e3d3cdb
f1bc956
2070cc4
6c1eb97
21fd545
00fd2b8
e21e525
2204375
e5a2ebd
5ccf4d3
dc6dd6d
44e1977
630d213
930e5c4
9963c58
427c472
bddff60
a003034
1b5756c
17c4d09
9e536ca
24b932d
9ab0cd6
8183d1d
32c6165
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| lib | ||
| node_modules | ||
| out-benchmark |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| # Blacklist - exclude everything except npm defaults such as LICENSE, etc | ||
| * | ||
| !*/ | ||
|
|
||
| # Whitelist - lib/ | ||
| !lib/**/*.d.ts | ||
|
|
||
| !lib/**/*.js | ||
| !lib/**/*.js.map | ||
|
|
||
| !lib/**/*.css | ||
|
|
||
| # Whitelist - src/ | ||
| !src/**/*.ts | ||
| !src/**/*.d.ts | ||
|
|
||
| !src/**/*.js | ||
| !src/**/*.js.map | ||
|
|
||
| !src/**/*.css | ||
|
|
||
| # Blacklist - src/ test files | ||
| src/**/*.test.ts | ||
| src/**/*.test.d.ts | ||
| src/**/*.test.js | ||
| src/**/*.test.js.map | ||
|
|
||
| # Whitelist - typings/ | ||
| !typings/*.d.ts |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| Copyright (c) 2023, The xterm.js authors (https://github.com/xtermjs/xterm.js) | ||
|
|
||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
|
|
||
| The above copyright notice and this permission notice shall be included in | ||
| all copies or substantial portions of the Software. | ||
|
|
||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| THE SOFTWARE. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| ## xterm-addon-unicode-graphemes | ||
|
|
||
| ⚠️ **This addon is currently experimental and may introduce unexpected and non-standard behavior** | ||
|
|
||
| An addon providing enhanced Unicode support (include grapheme clustering) for xterm.js. | ||
|
|
||
| The file `src/UnicodeProperties.ts` is generated and depends on the Unicode version. See [the unicode-properties project](https://github.com/PerBothner/unicode-properties) for credits and re-generation instructions. | ||
|
|
||
| ### Install | ||
|
|
||
| ```bash | ||
| npm install --save xterm-addon-unicode-graphemes | ||
| ``` | ||
|
|
||
| ### Usage | ||
|
|
||
| ```ts | ||
| import { Terminal } from 'xterm'; | ||
| import { UnicodeGraphemeAddon } from 'xterm-addon-unicode-graphemes'; | ||
|
|
||
| const terminal = new Terminal(); | ||
| const unicodeGraphemeAddon = new UnicodeGraphemeAddon(); | ||
| terminal.loadAddon(unicodeGraphemeAddon); | ||
| ``` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| /** | ||
| * Copyright (c) 2019 The xterm.js authors. All rights reserved. | ||
| * @license MIT | ||
| */ | ||
|
|
||
| import { perfContext, before, ThroughputRuntimeCase } from 'xterm-benchmark'; | ||
|
|
||
| import { spawn } from 'node-pty'; | ||
| import { Utf8ToUtf32, stringFromCodePoint } from 'common/input/TextDecoder'; | ||
| import { Terminal } from 'browser/Terminal'; | ||
| import { UnicodeGraphemeProvider } from 'UnicodeGraphemeProvider'; | ||
|
|
||
|
|
||
| function fakedAddonLoad(terminal: any): void { | ||
| // resembles what UnicodeGraphemesAddon.activate does under the hood | ||
| terminal.unicodeService.register(new UnicodeGraphemeProvider()); | ||
| terminal.unicodeService.activeVersion = '15-graphemes'; | ||
| } | ||
|
|
||
|
|
||
| perfContext('Terminal: ls -lR /usr/lib', () => { | ||
| let content = ''; | ||
| let contentUtf8: Uint8Array; | ||
|
|
||
| before(async () => { | ||
| // grab output from "ls -lR /usr" | ||
| const p = spawn('ls', ['--color=auto', '-lR', '/usr/lib'], { | ||
| name: 'xterm-256color', | ||
| cols: 80, | ||
| rows: 25, | ||
| cwd: process.env.HOME, | ||
| env: process.env, | ||
| encoding: (null as unknown as string) // needs to be fixed in node-pty | ||
| }); | ||
| const chunks: Buffer[] = []; | ||
| let length = 0; | ||
| p.on('data', data => { | ||
| chunks.push(data as unknown as Buffer); | ||
| length += data.length; | ||
| }); | ||
| await new Promise<void>(resolve => p.on('exit', () => resolve())); | ||
| contentUtf8 = Buffer.concat(chunks, length); | ||
| // translate to content string | ||
| const buffer = new Uint32Array(contentUtf8.length); | ||
| const decoder = new Utf8ToUtf32(); | ||
| const codepoints = decoder.decode(contentUtf8, buffer); | ||
| for (let i = 0; i < codepoints; ++i) { | ||
| content += stringFromCodePoint(buffer[i]); | ||
| // peek into content to force flat repr in v8 | ||
| if (!(i % 10000000)) { | ||
| content[i]; | ||
| } | ||
| } | ||
| }); | ||
|
|
||
| perfContext('write/string/async', () => { | ||
| let terminal: Terminal; | ||
| before(() => { | ||
| terminal = new Terminal({ cols: 80, rows: 25, scrollback: 1000 }); | ||
| fakedAddonLoad(terminal); | ||
| }); | ||
| new ThroughputRuntimeCase('', async () => { | ||
| await new Promise<void>(res => terminal.write(content, res)); | ||
| return { payloadSize: contentUtf8.length }; | ||
| }, { fork: false }).showAverageThroughput(); | ||
| }); | ||
|
|
||
| perfContext('write/Utf8/async', () => { | ||
| let terminal: Terminal; | ||
| before(() => { | ||
| terminal = new Terminal({ cols: 80, rows: 25, scrollback: 1000 }); | ||
| }); | ||
| new ThroughputRuntimeCase('', async () => { | ||
| await new Promise<void>(res => terminal.write(content, res)); | ||
| return { payloadSize: contentUtf8.length }; | ||
| }, { fork: false }).showAverageThroughput(); | ||
| }); | ||
| }); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| { | ||
| "APP_PATH": ".benchmark", | ||
| "evalConfig": { | ||
| "tolerance": { | ||
| "*": [0.75, 1.5], | ||
| "*.dev": [0.01, 1.5], | ||
| "*.cv": [0.01, 1.5], | ||
| "EscapeSequenceParser.benchmark.js.*.averageThroughput.mean": [0.9, 5] | ||
| }, | ||
| "skip": [ | ||
| "*.median", | ||
| "*.runs", | ||
| "*.dev", | ||
| "*.cv", | ||
| "EscapeSequenceParser.benchmark.js.*.averageRuntime", | ||
| "Terminal.benchmark.js.*.averageRuntime" | ||
| ] | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| { | ||
| "compilerOptions": { | ||
| "lib": ["dom", "es6"], | ||
| "outDir": "../out-benchmark", | ||
| "types": ["../../../node_modules/@types/node"], | ||
| "moduleResolution": "node", | ||
| "strict": false, | ||
| "target": "es2015", | ||
| "module": "commonjs", | ||
| "baseUrl": ".", | ||
| "paths": { | ||
| "common/*": ["../../../src/common/*"], | ||
| "browser/*": ["../../../src/browser/*"], | ||
| "UnicodeGraphemeProvider": ["../src/UnicodeGraphemeProvider"] | ||
| } | ||
| }, | ||
| "include": ["../**/*", "../../../typings/xterm.d.ts"], | ||
| "exclude": ["../../../**/*test.ts", "../../**/*api.ts"], | ||
| "references": [ | ||
| { "path": "../../../src/common" }, | ||
| { "path": "../../../src/browser" } | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| { | ||
| "name": "xterm-addon-unicode-graphemes", | ||
Tyriar marked this conversation as resolved.
Show resolved
Hide resolved
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should stick to the major version of unicode to separate the unicode support. Thats easier to grasp for ppl. For the grapheme vs not-grapheme support I suggest to expose 2 providers from the addon, one with and one w'o support.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where do you draw the line? The non-grapheme variant should still handle simple clusters that consist of just a single base character followed by some modifiers. Otherwise it's a regression. Without having examined the issue, the main differentiator is whether to ignore ZWJ (zero-width-joiner), thus treating true compounds as separate clusters. Handling of Regional Indicators (flags) is another issue. It is certainly possible to add a flag that restores the old non-standard behavior.. I'd rather not (I'm not sure it's needed), but I'll do whatever you and @Tyriar agree.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well the problem here is, that many cmdline apps are simply not segmentation aware, unless the system provides better unicode support for them, and instead just add Esp. macos is likely to create issues here, as it delivers a very old So to separate grapheme support from non grapheme support, I would not deactivate certain segmentation rules, but the rule handling completely falling back to old wcwidth measuring. Note that grapheme handling was not even a thing in terminals until recently, so apps are very likely to get this wrong until they adopted it as well.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the question is: If you paste a string containing a compound cluster into an input field in an application that just uses wcwidth and doesn't know anything about clusters, what will happen? Assume the application knows enough that wcwidth==0 means a combination. What should happen? Is this a use-case we should worry about? In addition to spacing and alignment issues, consider what happens when editing the input field: The application thinks the cursor is one place, while the terminal thinks it is another place. Is this something we should support? Is suggesting that people fall back to unicode6 or unicode11 unacceptable? It is probably not too difficult for the addon to provide two implementations of I haven't looked into how much would be involved.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Whatever is defined in the selected unicode version + ruleset, e.g. for no-grapheme support the old wcwidth calc.
Yes, it misses a lot of codepoints. V6 is even worse, as codepoints changed metrics in between.
Isn't it just a bypass for the joining, as the width has to be pulled anyway? I know the situation with unicode is really messed up in terminals, even worse for xtermjs, as it cannot even determine a sane default from the system it runs on (other desktop TEs can do that). Regarding grapheme support in terminals we are in an "infrastructure trap" - someone has to make the first move (here either TE or app side), and the other side should follow. Thus I think grapheme handling should be on by default. But making it mandatory is imho wrong, as it is still pretty new in TE context. |
||
| "version": "0.1.0", | ||
| "author": { | ||
| "name": "The xterm.js authors", | ||
| "url": "https://xtermjs.org/" | ||
| }, | ||
| "main": "lib/xterm-addon-unicode-graphemes.js", | ||
| "types": "typings/xterm-addon-unicode-graphemes.d.ts", | ||
| "repository": "https://github.com/xtermjs/xterm.js/tree/master/addons/xterm-addon-unicode-graphemes", | ||
| "license": "MIT", | ||
| "keywords": [ | ||
| "terminal", | ||
| "xterm", | ||
| "xterm.js" | ||
| ], | ||
| "scripts": { | ||
| "build": "../../node_modules/.bin/tsc -p .", | ||
| "prepackage": "npm run build", | ||
| "package": "../../node_modules/.bin/webpack", | ||
| "prepublishOnly": "npm run package", | ||
| "benchmark": "NODE_PATH=../../out:./out:./out-benchmark/ ../../node_modules/.bin/xterm-benchmark -r 5 -c benchmark/benchmark.json out-benchmark/benchmark/*benchmark.js", | ||
| "benchmark-baseline": "NODE_PATH=../../out:./out:./out-benchmark/ ../../node_modules/.bin/xterm-benchmark -r 5 -c benchmark/benchmark.json --baseline out-benchmark/benchmark/*benchmark.js", | ||
| "benchmark-eval": "NODE_PATH=../../out:./out:./out-benchmark/ ../../node_modules/.bin/xterm-benchmark -r 5 -c benchmark/benchmark.json --eval out-benchmark/benchmark/*benchmark.js" | ||
| }, | ||
| "peerDependencies": { | ||
| "xterm": "^5.0.0" | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| /** | ||
| * Copyright (c) 2023 The xterm.js authors. All rights reserved. | ||
| * @license MIT | ||
| */ | ||
|
|
||
| import { IUnicodeVersionProvider } from 'xterm'; | ||
| import { UnicodeCharProperties, UnicodeCharWidth } from 'common/services/Services'; | ||
| import { UnicodeService } from 'common/services/UnicodeService'; | ||
| import * as UC from './third-party/UnicodeProperties'; | ||
|
|
||
| export class UnicodeGraphemeProvider implements IUnicodeVersionProvider { | ||
| public readonly version; | ||
| public ambiguousCharsAreWide: boolean = false; | ||
| public readonly handleGraphemes: boolean; | ||
|
|
||
| constructor(handleGraphemes: boolean = true) { | ||
| this.version = handleGraphemes ? '15-graphemes' : '15'; | ||
| this.handleGraphemes = handleGraphemes; | ||
| } | ||
|
|
||
| private static readonly _plainNarrowProperties: UnicodeCharProperties | ||
| = UnicodeService.createPropertyValue(UC.GRAPHEME_BREAK_Other, 1, false); | ||
|
|
||
| public charProperties(codepoint: number, preceding: UnicodeCharProperties): UnicodeCharProperties { | ||
| // Optimize the simple ASCII case, under the condition that | ||
| // UnicodeService.extractCharKind(preceding) === GRAPHEME_BREAK_Other | ||
| // (which also covers the case that preceding === 0). | ||
| if ((codepoint >= 32 && codepoint < 127) && (preceding >> 3) === 0) { | ||
| return UnicodeGraphemeProvider._plainNarrowProperties; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hot path optimization: (Note thats only possible, if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While my TypeScript knowledge is limited, I'd be surprised if you could initialize a const enum value to that of an expression involving a function call. However, it is "const global data" in that it is a value that would not depend on Unicode version. We can manually pre-compute the value, and use that to define an enum const. We can depend on the test suite to confirm that we didn't mess up the calculation. |
||
| } | ||
|
|
||
| let charInfo = UC.getInfo(codepoint); | ||
| let w = UC.infoToWidthInfo(charInfo); | ||
| let shouldJoin = false; | ||
| if (w >= 2) { | ||
| // Treat emoji_presentation_selector as WIDE. | ||
| w = w === 3 || this.ambiguousCharsAreWide || codepoint === 0xfe0f ? 2 : 1; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this need an explicit handling of VS15 (U+FE0E) as well? Imho VS15 should always turn emojis into text repr with only 1 cell width.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We really needed an update to the ancient East Asian Widths TR to deal with grapheme clusters and more. In addition to VS15, it would be nice to have a recommendation on how wide en/em dashes/spaces should be. @Tyriar If perchance you have a contact that is involved with Unicode standardization, perhaps check with them?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @PerBothner I think they dont feel obligated to solve the monospace issue for us:
That note got added to TR-11 with unicode version 11. It basically says - "it is much more complicated, and we dont care to spec things out for the terminal". 🙈
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah well, I am not sure anymore, if VS15 should reduce emojis to 1 cell width. For VS16 it is explicitly mentioned that pictograms should be wide. There no such notion for VS15, which makes them somewhat unclear again (if width property should be applied, which can be narrow or wide for different symbols, or whether they still stick to wide). What a nonsense...
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No contact 🙁
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To not block on a silly thing like VS15, maybe just add a comment somewhere, that VS15 is not treated special in width handling. |
||
| } else { | ||
| w = 1; | ||
| } | ||
| if (preceding !== 0) { | ||
| const oldWidth = UnicodeService.extractWidth(preceding); | ||
| if (this.handleGraphemes) { | ||
| charInfo = UC.shouldJoin(UnicodeService.extractCharKind(preceding), charInfo); | ||
| } else { | ||
| charInfo = w === 0 ? 1 : 0; | ||
| } | ||
| shouldJoin = charInfo > 0; | ||
| if (shouldJoin) { | ||
| if (oldWidth > w) { | ||
| w = oldWidth; | ||
| } else if (charInfo === 32) { // UC.GRAPHEME_BREAK_SAW_Regional_Pair) | ||
| w = 2; | ||
| } | ||
| } | ||
| } | ||
| return UnicodeService.createPropertyValue(charInfo, w, shouldJoin); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as comment above. |
||
| } | ||
|
|
||
| public wcwidth(codepoint: number): UnicodeCharWidth { | ||
| const charInfo = UC.getInfo(codepoint); | ||
| const w = UC.infoToWidthInfo(charInfo); | ||
| const kind = (charInfo & UC.GRAPHEME_BREAK_MASK) >> UC.GRAPHEME_BREAK_SHIFT; | ||
| if (kind === UC.GRAPHEME_BREAK_Extend || kind === UC.GRAPHEME_BREAK_Prepend) { | ||
| return 0; | ||
| } | ||
| if (w >= 2 && (w === 3 || this.ambiguousCharsAreWide)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again possibly affected by VS15? |
||
| return 2; | ||
| } | ||
| return 1; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| /** | ||
| * Copyright (c) 2023 The xterm.js authors. All rights reserved. | ||
| * @license MIT | ||
| * | ||
| * UnicodeVersionProvider for V15 with grapeme cluster handleing. | ||
| */ | ||
|
|
||
| import { Terminal, ITerminalAddon, IUnicodeHandling } from 'xterm'; | ||
| import { UnicodeGraphemeProvider } from './UnicodeGraphemeProvider'; | ||
|
|
||
|
|
||
| export class UnicodeGraphemesAddon implements ITerminalAddon { | ||
| private _provider15Graphemes?: UnicodeGraphemeProvider; | ||
| private _provider15?: UnicodeGraphemeProvider; | ||
| private _unicode?: IUnicodeHandling; | ||
| private _oldVersion: string = ''; | ||
|
|
||
| public activate(terminal: Terminal): void { | ||
| if (! this._provider15) { | ||
| this._provider15 = new UnicodeGraphemeProvider(false); | ||
| } | ||
| if (! this._provider15Graphemes) { | ||
| this._provider15Graphemes = new UnicodeGraphemeProvider(true); | ||
| } | ||
| const unicode = terminal.unicode; | ||
| this._unicode = unicode; | ||
| unicode.register(this._provider15); | ||
| unicode.register(this._provider15Graphemes); | ||
| this._oldVersion = unicode.activeVersion; | ||
| unicode.activeVersion = '15-graphemes'; | ||
| } | ||
|
|
||
| public dispose(): void { | ||
| if (this._unicode) { | ||
| this._unicode.activeVersion = this._oldVersion; | ||
| } | ||
| } | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.