Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

This is the changelog for the open source version of tiktoken.

## [Unreleased]
- Fix stack overflow error when encoding very long repetitive strings by optimizing BPE algorithm for memory efficiency

## [v0.9.0]
- Support for `o1` and `o3` models
- Better error messages when loading invalid vocabulary files
Expand Down
170 changes: 170 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import time
import statistics
import tiktoken

def benchmark_encoding(text, encoding_name="cl100k_base", runs=5):
    """Encode *text* ``runs`` times and return timing statistics.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to benchmark.
        runs: Number of timed encode passes; must be at least 1.

    Returns:
        Dict with 'token_count', 'avg_time', 'min_time', 'max_time',
        'median_time', and the raw per-run 'times' list.

    Raises:
        ValueError: If ``runs`` is less than 1 (there would be no tokens
            or timings to report; the original code raised NameError here).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    encoder = tiktoken.get_encoding(encoding_name)
    times = []
    tokens = []

    for _ in range(runs):
        # perf_counter is monotonic and high-resolution, unlike time.time,
        # so short encode calls are timed reliably.
        start_time = time.perf_counter()
        tokens = encoder.encode(text)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    return {
        'token_count': len(tokens),
        'avg_time': statistics.mean(times),
        'min_time': min(times),
        'max_time': max(times),
        'median_time': statistics.median(times),
        'times': times,
    }

# Test cases
test_cases = [
# Regular English text
{
'name': 'English Essay',
'text': '''
The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet.
Natural language processing has evolved significantly over the past decade. With the advent of transformer models,
we've seen remarkable improvements in various NLP tasks such as translation, summarization, and question answering.
These models leverage attention mechanisms to process text in parallel rather than sequentially, allowing for more
efficient training and inference. While traditional recurrent neural networks (RNNs) and long short-term memory (LSTM)
networks process text one token at a time, transformers can attend to all tokens simultaneously, capturing long-range
dependencies more effectively. The development of pre-training techniques has also contributed to the success of
modern NLP systems. By pre-training models on large corpora of text, researchers have been able to develop models
that possess a form of general language understanding, which can then be fine-tuned for specific downstream tasks.
This approach has led to the creation of models like BERT, GPT, and T5, each with their own unique architecture
and capabilities. Despite these advances, challenges remain in the field of NLP. Issues such as bias in training
data, the environmental impact of training large models, and the need for more efficient architectures continue
to drive research in this area. As we move forward, it's likely that we'll see a continued focus on developing
models that are not only more powerful but also more efficient and ethical.
''' * 100 # Repeat to make it longer
},

# Code
{
'name': 'Python Code',
'text': '''
def quicksort(arr):
"""
Implement quicksort algorithm to sort an array

Args:
arr: List of comparable elements

Returns:
Sorted list
"""
if len(arr) <= 1:
return arr

pivot = arr[len(arr) // 2]
left = [x for x in arr if x < pivot]
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]

return quicksort(left) + middle + quicksort(right)

def merge_sort(arr):
"""
Implement merge sort algorithm

Args:
arr: List of comparable elements

Returns:
Sorted list
"""
if len(arr) <= 1:
return arr

mid = len(arr) // 2
left = merge_sort(arr[:mid])
right = merge_sort(arr[mid:])

return merge(left, right)

def merge(left, right):
"""Merge two sorted arrays"""
result = []
i = j = 0

while i < len(left) and j < len(right):
if left[i] <= right[j]:
result.append(left[i])
i += 1
else:
result.append(right[j])
j += 1

result.extend(left[i:])
result.extend(right[j:])
return result
''' * 50 # Repeat to make it longer
},

# Mixed content with some repetition but not excessive
{
'name': 'Mixed Content',
'text': '''
<html>
<head>
<title>Sample Document</title>
</head>
<body>
<h1>Welcome to the Sample Page</h1>
<p>This is a paragraph with some text. It also contains some repeated content like:</p>
<ul>
<li>Item one</li>
<li>Item two</li>
<li>Item three</li>
</ul>
<p>And here is a table:</p>
<table>
<tr><td>Row 1, Col 1</td><td>Row 1, Col 2</td></tr>
<tr><td>Row 2, Col 1</td><td>Row 2, Col 2</td></tr>
<tr><td>Row 3, Col 1</td><td>Row 3, Col 2</td></tr>
</table>
<div class="repeated">
This div has some moderately repeated content.
This div has some moderately repeated content.
This div has some moderately repeated content.
</div>
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
</p>
</body>
</html>
''' * 30 # Repeat to make it longer
},

# Moderate repetition (but not pathological)
{
'name': 'Moderate Repetition',
'text': 'The word "repeated" is repeated repeatedly in this repeated text with repeated repetition of the repeated word "repeated". ' * 100
}
]

# Drive the benchmark over every corpus entry and print per-case stats.
print("\nRunning benchmarks on various text types...\n")

for case in test_cases:
    sample = case['text']
    print(f"Testing: {case['name']}")
    print(f"Text length: {len(sample)} characters")

    stats = benchmark_encoding(sample)

    print(f"Token count: {stats['token_count']}")
    print(f"Average time: {stats['avg_time']:.4f} seconds")
    print(f"Min time: {stats['min_time']:.4f} seconds")
    print(f"Max time: {stats['max_time']:.4f} seconds")
    print(f"Median time: {stats['median_time']:.4f} seconds")
    print(f"Tokens per second: {stats['token_count'] / stats['avg_time']:.2f}")
    # Same output as "\n" + "-"*80 + "\n": a blank line, 80 dashes, a blank line.
    print(f"\n{'-' * 80}\n")

print("Benchmarks completed!")
48 changes: 48 additions & 0 deletions benchmark_modified.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

Running benchmarks on various text types...

Testing: English Essay
Text length: 167600 characters
Token count: 30801
Average time: 0.0213 seconds
Min time: 0.0186 seconds
Max time: 0.0290 seconds
Median time: 0.0191 seconds
Tokens per second: 1447885.67

--------------------------------------------------------------------------------

Testing: Python Code
Text length: 59100 characters
Token count: 14452
Average time: 0.0118 seconds
Min time: 0.0115 seconds
Max time: 0.0125 seconds
Median time: 0.0115 seconds
Tokens per second: 1220145.20

--------------------------------------------------------------------------------

Testing: Mixed Content
Text length: 38250 characters
Token count: 9301
Average time: 0.0095 seconds
Min time: 0.0074 seconds
Max time: 0.0176 seconds
Median time: 0.0075 seconds
Tokens per second: 980462.28

--------------------------------------------------------------------------------

Testing: Moderate Repetition
Text length: 12300 characters
Token count: 2401
Average time: 0.0012 seconds
Min time: 0.0012 seconds
Max time: 0.0012 seconds
Median time: 0.0012 seconds
Tokens per second: 1976783.12

--------------------------------------------------------------------------------

Benchmarks completed!
48 changes: 48 additions & 0 deletions benchmark_original.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

Running benchmarks on various text types...

Testing: English Essay
Text length: 167600 characters
Token count: 30801
Average time: 0.0228 seconds
Min time: 0.0185 seconds
Max time: 0.0386 seconds
Median time: 0.0186 seconds
Tokens per second: 1350112.53

--------------------------------------------------------------------------------

Testing: Python Code
Text length: 59100 characters
Token count: 14452
Average time: 0.0160 seconds
Min time: 0.0116 seconds
Max time: 0.0318 seconds
Median time: 0.0122 seconds
Tokens per second: 904337.00

--------------------------------------------------------------------------------

Testing: Mixed Content
Text length: 38250 characters
Token count: 9301
Average time: 0.0075 seconds
Min time: 0.0074 seconds
Max time: 0.0075 seconds
Median time: 0.0075 seconds
Tokens per second: 1246922.63

--------------------------------------------------------------------------------

Testing: Moderate Repetition
Text length: 12300 characters
Token count: 2401
Average time: 0.0012 seconds
Min time: 0.0012 seconds
Max time: 0.0012 seconds
Median time: 0.0012 seconds
Tokens per second: 1987786.49

--------------------------------------------------------------------------------

Benchmarks completed!
68 changes: 68 additions & 0 deletions compare_original_vs_fixed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import tempfile
import os
import subprocess
import tiktoken
import time

def run_test_with_code_version(version, code_to_test):
    """Evaluate *code_to_test* against a chosen build of the Rust extension.

    Backs up src/lib.rs, optionally checks out the pre-fix version from
    origin/main, reinstalls the package, times the evaluated expression,
    and always restores (and reinstalls) the working-tree sources.

    Args:
        version: "original" to test origin/main's src/lib.rs; anything else
            tests the working-tree (fixed) version.
        code_to_test: Python expression string passed to eval(); it may
            reference module-level names such as ``text``.

    Returns:
        Tuple of (evaluation result, elapsed seconds).

    Raises:
        subprocess.CalledProcessError: If backup, checkout, or install fails.
    """
    # Reserve a temp file path to hold a backup of the current lib.rs.
    with tempfile.NamedTemporaryFile(suffix='.rs', delete=False) as temp:
        temp_path = temp.name

    # Backup current lib.rs. List-form subprocess calls avoid shell quoting
    # and injection issues with the temp path; check=True surfaces failures
    # instead of silently benchmarking the wrong build.
    subprocess.run(["cp", "src/lib.rs", temp_path], check=True)

    try:
        if version == "original":
            # Checkout the pre-fix version of the Rust source.
            subprocess.run(
                ["git", "checkout", "origin/main", "--", "src/lib.rs"],
                check=True,
            )
        # else: the working tree already holds the fixed version.

        # Rebuild/install so the selected sources are the active extension.
        subprocess.run(
            ["pip", "install", "-e", "."],
            check=True,
            stdout=subprocess.DEVNULL,
        )

        # SECURITY NOTE: eval() executes arbitrary code. Acceptable here only
        # because this is a local dev script with hard-coded expressions;
        # never pass untrusted input through this parameter.
        start = time.perf_counter()
        result = eval(code_to_test)
        end = time.perf_counter()

        return result, end - start
    finally:
        # Best-effort restore of our version, even if the try block raised;
        # no check=True so a restore failure doesn't mask the original error.
        subprocess.run(["cp", temp_path, "src/lib.rs"])
        os.unlink(temp_path)
        subprocess.run(["pip", "install", "-e", "."], stdout=subprocess.DEVNULL)

# Inputs that stress the BPE merge loop: long single-byte runs, a run
# embedded in normal text, and a short repeated multi-byte pattern.
test_cases = [
    ("X" * 1000),
    ("X" * 10000),
    ("X" * 50000),
    ("Hello world! " + ("X" * 1000) + " This is a test."),
    ("ABC" * 1000)
]

print("Comparing tokenization between original and fixed code:")
print("------------------------------------------------------")

# NOTE: the loop variable must stay named `text` — the eval'd expression
# below resolves it through this module's globals.
for case_no, text in enumerate(test_cases, start=1):
    print(f"Test case #{case_no}:")

    code = "len(tiktoken.get_encoding('cl100k_base').encode(text))"

    # Token count and timing from the pre-fix build...
    orig_tokens, orig_time = run_test_with_code_version("original", code)
    # ...and from the current (fixed) build.
    fixed_tokens, fixed_time = run_test_with_code_version("fixed", code)

    print(f" Original: {orig_tokens} tokens in {orig_time:.4f} seconds")
    print(f" Fixed: {fixed_tokens} tokens in {fixed_time:.4f} seconds")

    if orig_tokens != fixed_tokens:
        print(f" ✗ Tokenizations differ! Original: {orig_tokens}, Fixed: {fixed_tokens}")
    else:
        print(" ✓ Tokenizations match!")
    print()
17 changes: 17 additions & 0 deletions compare_repetitive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import time
import tiktoken

def test_repetitive_encoding(length):
    """Encode a string of ``length`` repeated 'X' characters and print timings.

    Args:
        length: Number of 'X' characters in the test string.
    """
    text = "X" * length
    encoder = tiktoken.get_encoding("cl100k_base")

    # perf_counter is monotonic and high-resolution; time.time can be too
    # coarse for sub-millisecond encode calls, skewing the tokens/sec figure.
    start_time = time.perf_counter()
    tokens = encoder.encode(text)
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    print(f"Encoded {length} 'X's into {len(tokens)} tokens")
    print(f"Time taken: {elapsed:.4f} seconds")
    print(f"Tokens per second: {len(tokens) / elapsed:.2f}")

# Exercise the encoder on a mid-sized pathological input (50k identical bytes),
# the class of input the BPE stack-overflow fix targets.
print("Testing with medium repetitive string (50,000 Xs)")
test_repetitive_encoding(50000)
29 changes: 29 additions & 0 deletions compare_tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import tiktoken

def test_with_repetitive_strings():
    """Tokenize repetitive strings of several sizes and report token counts.

    Prints the character-to-token ratio for pure single-character runs, then
    checks a mixed string and a repeated multi-character pattern.
    """
    print("Testing tokenization equivalence:")

    # Build the encoder once; the original recreated it on every loop
    # iteration and relied on the loop variable leaking out of the loop
    # for the post-loop checks below.
    encoder = tiktoken.get_encoding("cl100k_base")

    for length in [1000, 10000, 50000, 100000]:
        text = "X" * length
        tokens = encoder.encode(text)
        print(f"Length {length} -> {len(tokens)} tokens")

        # Check the ratio
        # Note: For most strings, the tokens should be about 1/8 the length
        # for a single repeated character because bytes are merged efficiently
        print(f" Character-to-token ratio: {length / len(tokens):.2f}")

    # Also try with some mixed content
    text = "Hello world! " + ("X" * 10000) + " This is a test."
    tokens = encoder.encode(text)
    print(f"Mixed content (mostly repetitive): {len(tokens)} tokens")

    # Try with a more complex repetitive pattern
    text = ("ABC" * 10000)
    tokens = encoder.encode(text)
    print(f"'ABC' repeated 10000 times: {len(tokens)} tokens")

# Run the equivalence checks when the script is executed.
test_with_repetitive_strings()
Loading