Skip to content

Commit 93c51b4

Browse files
authored
[METRICS] Fix tokenization issue of CJK languages for evaluation (#20)
For CJK languages, we need to tokenize them with `CJSegmenter` before sending them to `mweralign.align_texts`. This PR makes the following modifications: 1. Apply `CJSegmenter` before calling `mweralign.align_texts`. This is done for both latency scorer and quality scorer. 2. Add `latency_unit` argument to the quality scorer and use this argument to trigger `CJSegmenter` in the quality scorer.
1 parent 123054a commit 93c51b4

File tree

5 files changed

+295
-5
lines changed

5 files changed

+295
-5
lines changed

simulstream/metrics/score_quality.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ def cli_main():
151151
parser.add_argument(
152152
"--audio-definition", "-a", type=str, default=None,
153153
help="Path to the yaml file containing the segment-level audio information.")
154+
parser.add_argument(
155+
"--latency-unit", choices=["char", "word"], default="word",
156+
help="Whether to compute stats based on words or characters. Default: word.")
154157
parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
155158
args, _ = parser.parse_known_args()
156159

simulstream/metrics/scorers/latency/mwersegmenter.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from typing import List
1818

1919
from mweralign import mweralign
20+
from mweralign.segmenter import CJSegmenter
2021

2122
from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
2223
from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
@@ -58,6 +59,7 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
5859
def __init__(self, args):
    """
    Initialize the latency scorer from parsed CLI arguments.

    :param args: parsed command-line arguments; ``args.latency_unit``
        ("word" or "char") selects the unit latency stats are computed on.
    """
    super().__init__(args)
    self.latency_unit = args.latency_unit
    # For CJK languages ("char" unit) the text must be character-tokenized
    # with CJSegmenter before being sent to mweralign.align_texts.
    self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
6163

6264
def requires_reference(self) -> bool:
6365
return True
@@ -101,19 +103,50 @@ def _split_delays_by_segmented_text(
101103
f"Index {index} should have reached end of delays ({len(delays)})"
102104
return segmented_delays
103105

106+
def _tokenize(self, text: List[str]) -> List[str]:
107+
"""
108+
Tokenize text using the segmenter.
109+
110+
Borrowed from
111+
https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
112+
"""
113+
if self.segmenter is not None:
114+
tokenized_text = []
115+
for i in range(len(text)):
116+
if " ### " in text[i]:
117+
pieces = text[i].strip().split(" ### ")
118+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
119+
tokenized_text.append(" ### ".join(encoded))
120+
elif "\t" in text[i]:
121+
pieces = text[i].strip().split("\t")
122+
# underlying C++ binary still uses ###
123+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
124+
tokenized_text.append(" ### ".join(encoded))
125+
else:
126+
tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
127+
return "\n".join(tokenized_text)
128+
else:
129+
return "\n".join(text)
130+
104131
def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
105132
resegmented_samples = []
106133
for sample in samples:
107134
assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
108135

109-
resegmented_hypos = mweralign.align_texts(
110-
"\n".join([sentence_def.content for sentence_def in sample.reference]),
111-
sample.hypothesis.final_text).split("\n")
136+
hypo = self._tokenize([sample.hypothesis.final_text])
137+
refs = self._tokenize(
138+
[sentence_def.content for sentence_def in sample.reference])
139+
resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
112140

113141
assert len(resegmented_hypos) == len(sample.reference), \
114142
f"Reference ({sample.audio_name}) has mismatched number of target " \
115143
f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
116144

145+
if self.segmenter is not None:
146+
# segmenter.decode will strip() the spaces, but we need them to align with delays
147+
resegmented_hypos = [
148+
hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
149+
117150
ideal_delays_splits = self._split_delays_by_segmented_text(
118151
sample.hypothesis.ideal_delays,
119152
resegmented_hypos)

simulstream/metrics/scorers/quality/mwersegmenter.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from typing import List, Optional
1818

1919
from mweralign import mweralign
20+
from mweralign.segmenter import CJSegmenter
2021

2122
from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
2223

@@ -56,6 +57,11 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
5657
... # Compute a custom quality score
5758
... return ...
5859
"""
60+
61+
def __init__(self, args):
    """
    Initialize the quality scorer from parsed CLI arguments.

    :param args: parsed command-line arguments; ``args.latency_unit == "char"``
        enables CJK character segmentation (CJSegmenter) before alignment.
    """
    super().__init__(args)
    # For CJK languages ("char" unit) the text must be character-tokenized
    # with CJSegmenter before being sent to mweralign.align_texts.
    self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
64+
5965
def requires_reference(self) -> bool:
6066
return True
6167

@@ -75,15 +81,48 @@ def _do_score(self, samples: List[ResegmentedQualityScoringSample]) -> float:
7581
"""
7682
...
7783

84+
def _tokenize(self, text: List[str]) -> List[str]:
85+
"""
86+
Tokenize text using the segmenter.
87+
88+
Borrowed from
89+
https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
90+
"""
91+
if self.segmenter is not None:
92+
tokenized_text = []
93+
for i in range(len(text)):
94+
if " ### " in text[i]:
95+
pieces = text[i].strip().split(" ### ")
96+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
97+
tokenized_text.append(" ### ".join(encoded))
98+
elif "\t" in text[i]:
99+
pieces = text[i].strip().split("\t")
100+
# underlying C++ binary still uses ###
101+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
102+
tokenized_text.append(" ### ".join(encoded))
103+
else:
104+
tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
105+
return "\n".join(tokenized_text)
106+
else:
107+
return "\n".join(text)
108+
78109
def score(self, samples: List[QualityScoringSample]) -> float:
79110
resegmented_samples = []
80111
for sample in samples:
81112
assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
82-
resegmented_hypos = mweralign.align_texts(
83-
"\n".join(sample.reference), sample.hypothesis).split("\n")
113+
hypo = self._tokenize([sample.hypothesis])
114+
refs = self._tokenize(sample.reference)
115+
resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
116+
84117
assert len(sample.reference) == len(resegmented_hypos), \
85118
f"Reference ({sample.audio_name}) has mismatched number of target " \
86119
f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
120+
121+
if self.segmenter is not None:
122+
# undo the character tokenization: drop the inserted token-separator spaces
# and restore original spaces, which the segmenter encoded as "_"
123+
resegmented_hypos = [
124+
hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
125+
87126
resegmented_samples.append(ResegmentedQualityScoringSample(
88127
sample.audio_name,
89128
resegmented_hypos,

uts/metrics/test_stream_laal.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Copyright 2026 FBK
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License
14+
15+
import unittest
16+
from argparse import Namespace
17+
18+
from simulstream.metrics.readers import OutputWithDelays, ReferenceSentenceDefinition
19+
from simulstream.metrics.scorers.latency import LatencyScoringSample
20+
from simulstream.metrics.scorers.latency.stream_laal import StreamLaal
21+
22+
23+
class StreamLaalTestCase(unittest.TestCase):
    """Tests for the StreamLaal latency scorer (word- and character-level)."""

    def test_basic(self):
        # Word-level (default) latency: an Italian hypothesis is realigned
        # against a two-sentence reference before latency is computed.
        reference = [
            ReferenceSentenceDefinition(
                "A New York, sono a capo di un'associazione no profit, chiamata Robin Hood.",
                12.61,
                4.07,
            ),
            ReferenceSentenceDefinition(
                "Quando non combatto la povertà, combatto gli incendi come assistente capitano di "
                "una brigata di pompieri volontari.",
                16.9,
                5.14,
            ),
        ]
        hypothesis = OutputWithDelays(
            "Tornando a New York, sono il capo dello sviluppo per un non-profit chiamato Robin "
            "Hood. Quando non sto combattendo la povertà, sto combattendo i fuochi.",
            # One ideal delay per hypothesis word.
            [14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 18.0,
             18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 20.0, 20.0, 20.0, 20.0],
            # One computation-aware delay per hypothesis word.
            [18.22, 18.22, 18.22, 18.22, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93,
             19.93, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 27.30, 27.30, 27.30,
             27.30],
        )
        scorer = StreamLaal(Namespace(latency_unit="word"))
        score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
        self.assertAlmostEqual(score.ideal_latency, 0.868587, 4)
        self.assertAlmostEqual(score.computational_aware_latency, 5.86, 4)

    def test_with_characters(self):
        # Character-level latency: CJK text is segmented into characters
        # (latency_unit="char"), mixed with a Latin-script segment ("Amy").
        reference = [
            ReferenceSentenceDefinition(
                "今天她看起很好,",
                12.61,
                3.07,
            ),
            ReferenceSentenceDefinition(
                "我们一起去公园散步吧。",
                16.9,
                3.14,
            ),
            ReferenceSentenceDefinition(
                "Amy",
                21.0,
                0.5,
            ),
            ReferenceSentenceDefinition(
                "今天心情很好",
                21.5,
                2.0,
            ),
        ]
        hypothesis = OutputWithDelays(
            "今天她很漂亮,我们一起去花园跑步吧。Amy 今天心情很好",
            [14.0, 14.0, 14.0, 15.0, 15.0, 16.0, 17.0,
             17.0, 17.0, 18.0, 18.0, 19.0, 19.0, 20.0, 20.0, 21.0, 21.0, 21.0,
             22.0, 22.0, 22.0, 22.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0],
            [14.5, 14.5, 14.5, 15.2, 15.2, 16.8, 17.5,
             18.0, 18.5, 18.5, 18.5, 20.1, 20.1, 21.3, 21.3, 22.0, 22.0, 22.0,
             23.0, 23.0, 23.0, 23.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0],
        )
        scorer = StreamLaal(Namespace(latency_unit="char"))
        score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
        self.assertAlmostEqual(score.ideal_latency, 1.333312, 4)
        self.assertAlmostEqual(score.computational_aware_latency, 2.074095, 4)


if __name__ == '__main__':
    unittest.main()
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# Copyright 2026 FBK
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License
14+
15+
import copy
16+
import unittest
17+
from argparse import Namespace
18+
19+
from simulstream.metrics.scorers.quality.mwersegmenter import (
20+
MWERSegmenterBasedQualityScorer,
21+
)
22+
from simulstream.metrics.scorers.latency.mwersegmenter import (
23+
MWERSegmenterBasedLatencyScorer,
24+
)
25+
from simulstream.metrics.scorers.latency import LatencyScores
26+
27+
28+
class TokenizeNoInplaceModificationTestCase(unittest.TestCase):
    """
    Ensures that _tokenize does not alter the references.
    See https://github.com/hlt-mt/simulstream/pull/20#issuecomment-3960951980
    """

    def _make_quality_scorer(self, latency_unit="char"):
        """Create a concrete subclass of the abstract quality scorer."""
        class _Scorer(MWERSegmenterBasedQualityScorer):
            def _do_score(self, samples):
                return 0.0

            @classmethod
            def add_arguments(cls, parser):
                pass

            def requires_source(self):
                return False

        args = Namespace(latency_unit=latency_unit)
        return _Scorer(args)

    def _make_latency_scorer(self, latency_unit="char"):
        """Create a concrete subclass of the abstract latency scorer."""
        class _Scorer(MWERSegmenterBasedLatencyScorer):
            def _do_score(self, samples):
                return LatencyScores(0.0, [])

            @classmethod
            def add_arguments(cls, parser):
                pass

            def requires_source(self):
                return False

        args = Namespace(latency_unit=latency_unit)
        return _Scorer(args)

    def _assert_no_inplace_modification(self, scorer, text):
        """Run _tokenize and verify its input list is left byte-identical."""
        original = copy.deepcopy(text)
        scorer._tokenize(text)
        self.assertEqual(text, original)

    def test_quality_tokenize_does_not_modify_input(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="char"), ["你好世界", "这是测试"])

    def test_latency_tokenize_does_not_modify_input(self):
        self._assert_no_inplace_modification(
            self._make_latency_scorer(latency_unit="char"), ["你好世界", "这是测试"])

    def test_quality_tokenize_no_modify_with_separator(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="char"), ["你好 ### 世界"])

    def test_quality_tokenize_no_modify_with_tab(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="char"), ["你好\t世界"])

    def test_quality_tokenize_does_not_modify_input_english(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="word"), ["hello world", "this is a test"])

    def test_latency_tokenize_does_not_modify_input_english(self):
        self._assert_no_inplace_modification(
            self._make_latency_scorer(latency_unit="word"), ["hello world", "this is a test"])

    def test_quality_tokenize_no_modify_with_separator_english(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="word"), ["hello ### world"])

    def test_quality_tokenize_no_modify_with_tab_english(self):
        self._assert_no_inplace_modification(
            self._make_quality_scorer(latency_unit="word"), ["hello\tworld"])


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)