Fix some bugs in the diffWords regex (and errors & ambiguities in the comment above it) (#635)

ExplodingCabbage · web-flow · commit ad6dc1728e52 · 2025-10-08T16:55:23.000+01:00
diff --git a/release-notes.md b/release-notes.md
@@ -3,6 +3,7 @@
 ## Future 8.0.3 release
 
 - [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case).
+- [#635](https://github.com/kpdecker/jsdiff/pull/635) - **small tweaks to tokenization behaviour of `diffWords`** when used *without* an `Intl.Segmenter`. Specifically, the soft hyphen (U+00AD) is no longer considered to be a word break, and the multiplication and division signs (`×` and `÷`) are now treated as punctuation instead of as letters / word characters.
 
 ## 8.0.2
 
diff --git a/src/diff/word.ts b/src/diff/word.ts
@@ -4,23 +4,25 @@ import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix,
 
 // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
 //
-// Ranges and exceptions:
-// Latin-1 Supplement, 0080–00FF
-//  - U+00D7  × Multiplication sign
-//  - U+00F7  ÷ Division sign
-// Latin Extended-A, 0100–017F
-// Latin Extended-B, 0180–024F
-// IPA Extensions, 0250–02AF
-// Spacing Modifier Letters, 02B0–02FF
-//  - U+02C7  ˇ &#711;  Caron
-//  - U+02D8  ˘ &#728;  Breve
-//  - U+02D9  ˙ &#729;  Dot Above
-//  - U+02DA  ˚ &#730;  Ring Above
-//  - U+02DB  ˛ &#731;  Ogonek
-//  - U+02DC  ˜ &#732;  Small Tilde
-//  - U+02DD  ˝ &#733;  Double Acute Accent
-// Latin Extended Additional, 1E00–1EFF
-const extendedWordChars = 'a-zA-Z0-9_\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';
+// Chars/ranges counted as "word" characters by this regex are as follows:
+//
+// + U+00AD  Soft hyphen
+// + 00C0–00FF (letters with diacritics from the Latin-1 Supplement), except:
+//   - U+00D7  × Multiplication sign
+//   - U+00F7  ÷ Division sign
+// + Latin Extended-A, 0100–017F
+// + Latin Extended-B, 0180–024F
+// + IPA Extensions, 0250–02AF
+// + Spacing Modifier Letters, 02B0–02FF, except:
+//   - U+02C7  ˇ &#711;  Caron
+//   - U+02D8  ˘ &#728;  Breve
+//   - U+02D9  ˙ &#729;  Dot Above
+//   - U+02DA  ˚ &#730;  Ring Above
+//   - U+02DB  ˛ &#731;  Ogonek
+//   - U+02DC  ˜ &#732;  Small Tilde
+//   - U+02DD  ˝ &#733;  Double Acute Accent
+// + Latin Extended Additional, 1E00–1EFF
+const extendedWordChars = 'a-zA-Z0-9_\\u{AD}\\u{C0}-\\u{D6}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';
 
 // Each token is one of the following:
 // - A punctuation mark plus the surrounding whitespace
diff --git a/test/diff/word.js b/test/diff/word.js
@@ -59,6 +59,89 @@ describe('WordDiff', function() {
         '.'
       ]);
     });
+
+    // Test for various behaviours discussed at
+    // https://github.com/kpdecker/jsdiff/issues/634#issuecomment-3381707327
+    // In particular we are testing that:
+    // 1. single code points representing accented characters (most of range
+    //    U+00C0 thru U+00FF) are treated as word characters
+    // 2. soft hyphens are treated as part of the word they appear in
+    // 3. the multiplication and division signs are punctuation
+    // 4. currency signs are punctuation
+    // 5. section symbol is punctuation
+    // 6. registered trademark symbol (®) is punctuation
+    // 7. fractions are punctuation
+    // The behaviour being tested for in points 4 thru 7 above is of debatable
+    // correctness; it is not totally obvious whether we SHOULD treat those
+    // things as punctuation characters or as word characters. Nonetheless, we
+    // have this test to help document the current behaviour.
+    it('should handle the 0080-00FF range the way we expect', () => {
+      expect(
+        wordDiff.tokenize(
+          'My daugh\u00adter, Am\u00E9lie, is 1½ years old and works for ' +
+            'Google® for £6 per hour (equivalently £6÷60=£0.10 per minute, or ' +
+            '£6×8=£48 per day), in violation of § 123 of the Child Labour Act.'
+        )
+      ).to.deep.equal([
+        'My ',
+        ' daugh\u00adter',
+        ', ',
+        ' Am\u00E9lie',
+        ', ',
+        ' is ',
+        ' 1',
+        '½ ',
+        ' years ',
+        ' old ',
+        ' and ',
+        ' works ',
+        ' for ',
+        ' Google',
+        '® ',
+        ' for ',
+        ' £',
+        '6 ',
+        ' per ',
+        ' hour ',
+        ' (',
+        'equivalently ',
+        ' £',
+        '6',
+        '÷',
+        '60',
+        '=',
+        '£',
+        '0',
+        '.',
+        '10 ',
+        ' per ',
+        ' minute',
+        ', ',
+        ' or ',
+        ' £',
+        '6',
+        '×',
+        '8',
+        '=',
+        '£',
+        '48 ',
+        ' per ',
+        ' day',
+        ')',
+        ', ',
+        ' in ',
+        ' violation ',
+        ' of ',
+        ' § ',
+        ' 123 ',
+        ' of ',
+        ' the ',
+        ' Child ',
+        ' Labour ',
+        ' Act',
+        '.'
+      ]);
+    });
   });
 
   describe('#diffWords', function() {