Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

This is the changelog for the open source version of tiktoken.

## [Unreleased]
- Fix stack overflow error when encoding very long repetitive strings by optimizing BPE algorithm for memory efficiency

## [v0.9.0]
- Support for `o1` and `o3` models
- Better error messages when loading invalid vocabulary files
Expand Down
170 changes: 170 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import time
import statistics
import tiktoken

def benchmark_encoding(text, encoding_name="cl100k_base", runs=5):
    """Encode *text* ``runs`` times and return timing statistics.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to benchmark.
        runs: Number of timed encode passes; must be at least 1.

    Returns:
        Dict with 'token_count', 'avg_time', 'min_time', 'max_time',
        'median_time', and the raw per-run 'times' list.

    Raises:
        ValueError: If ``runs`` is less than 1 (there would be no tokens
            or timings to report; the original code raised NameError here).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    encoder = tiktoken.get_encoding(encoding_name)
    times = []
    tokens = []

    for _ in range(runs):
        # perf_counter is monotonic and high-resolution, unlike time.time,
        # so short encode calls are timed reliably.
        start_time = time.perf_counter()
        tokens = encoder.encode(text)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    return {
        'token_count': len(tokens),
        'avg_time': statistics.mean(times),
        'min_time': min(times),
        'max_time': max(times),
        'median_time': statistics.median(times),
        'times': times,
    }

# Test cases
test_cases = [
# Regular English text
{
'name': 'English Essay',
'text': '''
The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet.
Natural language processing has evolved significantly over the past decade. With the advent of transformer models,
we've seen remarkable improvements in various NLP tasks such as translation, summarization, and question answering.
These models leverage attention mechanisms to process text in parallel rather than sequentially, allowing for more
efficient training and inference. While traditional recurrent neural networks (RNNs) and long short-term memory (LSTM)
networks process text one token at a time, transformers can attend to all tokens simultaneously, capturing long-range
dependencies more effectively. The development of pre-training techniques has also contributed to the success of
modern NLP systems. By pre-training models on large corpora of text, researchers have been able to develop models
that possess a form of general language understanding, which can then be fine-tuned for specific downstream tasks.
This approach has led to the creation of models like BERT, GPT, and T5, each with their own unique architecture
and capabilities. Despite these advances, challenges remain in the field of NLP. Issues such as bias in training
data, the environmental impact of training large models, and the need for more efficient architectures continue
to drive research in this area. As we move forward, it's likely that we'll see a continued focus on developing
models that are not only more powerful but also more efficient and ethical.
''' * 100 # Repeat to make it longer
},

# Code
{
'name': 'Python Code',
'text': '''
def quicksort(arr):
"""
Implement quicksort algorithm to sort an array

Args:
arr: List of comparable elements

Returns:
Sorted list
"""
if len(arr) <= 1:
return arr

pivot = arr[len(arr) // 2]
left = [x for x in arr if x < pivot]
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]

return quicksort(left) + middle + quicksort(right)

def merge_sort(arr):
"""
Implement merge sort algorithm

Args:
arr: List of comparable elements

Returns:
Sorted list
"""
if len(arr) <= 1:
return arr

mid = len(arr) // 2
left = merge_sort(arr[:mid])
right = merge_sort(arr[mid:])

return merge(left, right)

def merge(left, right):
"""Merge two sorted arrays"""
result = []
i = j = 0

while i < len(left) and j < len(right):
if left[i] <= right[j]:
result.append(left[i])
i += 1
else:
result.append(right[j])
j += 1

result.extend(left[i:])
result.extend(right[j:])
return result
''' * 50 # Repeat to make it longer
},

# Mixed content with some repetition but not excessive
{
'name': 'Mixed Content',
'text': '''
<html>
<head>
<title>Sample Document</title>
</head>
<body>
<h1>Welcome to the Sample Page</h1>
<p>This is a paragraph with some text. It also contains some repeated content like:</p>
<ul>
<li>Item one</li>
<li>Item two</li>
<li>Item three</li>
</ul>
<p>And here is a table:</p>
<table>
<tr><td>Row 1, Col 1</td><td>Row 1, Col 2</td></tr>
<tr><td>Row 2, Col 1</td><td>Row 2, Col 2</td></tr>
<tr><td>Row 3, Col 1</td><td>Row 3, Col 2</td></tr>
</table>
<div class="repeated">
This div has some moderately repeated content.
This div has some moderately repeated content.
This div has some moderately repeated content.
</div>
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
</p>
</body>
</html>
''' * 30 # Repeat to make it longer
},

# Moderate repetition (but not pathological)
{
'name': 'Moderate Repetition',
'text': 'The word "repeated" is repeated repeatedly in this repeated text with repeated repetition of the repeated word "repeated". ' * 100
}
]

# Drive the benchmark over every corpus entry and print per-case stats.
print("\nRunning benchmarks on various text types...\n")

for case in test_cases:
    sample = case['text']
    print(f"Testing: {case['name']}")
    print(f"Text length: {len(sample)} characters")

    stats = benchmark_encoding(sample)

    print(f"Token count: {stats['token_count']}")
    print(f"Average time: {stats['avg_time']:.4f} seconds")
    print(f"Min time: {stats['min_time']:.4f} seconds")
    print(f"Max time: {stats['max_time']:.4f} seconds")
    print(f"Median time: {stats['median_time']:.4f} seconds")
    print(f"Tokens per second: {stats['token_count'] / stats['avg_time']:.2f}")
    # Same output as "\n" + "-"*80 + "\n": a blank line, 80 dashes, a blank line.
    print(f"\n{'-' * 80}\n")

print("Benchmarks completed!")
48 changes: 48 additions & 0 deletions benchmark_modified.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

Running benchmarks on various text types...

Testing: English Essay
Text length: 167600 characters
Token count: 30801
Average time: 0.0213 seconds
Min time: 0.0186 seconds
Max time: 0.0290 seconds
Median time: 0.0191 seconds
Tokens per second: 1447885.67

--------------------------------------------------------------------------------

Testing: Python Code
Text length: 59100 characters
Token count: 14452
Average time: 0.0118 seconds
Min time: 0.0115 seconds
Max time: 0.0125 seconds
Median time: 0.0115 seconds
Tokens per second: 1220145.20

--------------------------------------------------------------------------------

Testing: Mixed Content
Text length: 38250 characters
Token count: 9301
Average time: 0.0095 seconds
Min time: 0.0074 seconds
Max time: 0.0176 seconds
Median time: 0.0075 seconds
Tokens per second: 980462.28

--------------------------------------------------------------------------------

Testing: Moderate Repetition
Text length: 12300 characters
Token count: 2401
Average time: 0.0012 seconds
Min time: 0.0012 seconds
Max time: 0.0012 seconds
Median time: 0.0012 seconds
Tokens per second: 1976783.12

--------------------------------------------------------------------------------

Benchmarks completed!
48 changes: 48 additions & 0 deletions benchmark_original.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

Running benchmarks on various text types...

Testing: English Essay
Text length: 167600 characters
Token count: 30801
Average time: 0.0228 seconds
Min time: 0.0185 seconds
Max time: 0.0386 seconds
Median time: 0.0186 seconds
Tokens per second: 1350112.53

--------------------------------------------------------------------------------

Testing: Python Code
Text length: 59100 characters
Token count: 14452
Average time: 0.0160 seconds
Min time: 0.0116 seconds
Max time: 0.0318 seconds
Median time: 0.0122 seconds
Tokens per second: 904337.00

--------------------------------------------------------------------------------

Testing: Mixed Content
Text length: 38250 characters
Token count: 9301
Average time: 0.0075 seconds
Min time: 0.0074 seconds
Max time: 0.0075 seconds
Median time: 0.0075 seconds
Tokens per second: 1246922.63

--------------------------------------------------------------------------------

Testing: Moderate Repetition
Text length: 12300 characters
Token count: 2401
Average time: 0.0012 seconds
Min time: 0.0012 seconds
Max time: 0.0012 seconds
Median time: 0.0012 seconds
Tokens per second: 1987786.49

--------------------------------------------------------------------------------

Benchmarks completed!
68 changes: 68 additions & 0 deletions compare_original_vs_fixed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import tempfile
import os
import subprocess
import tiktoken
import time

def run_test_with_code_version(version, code_to_test):
    """Evaluate *code_to_test* against a chosen build of the Rust extension.

    Backs up src/lib.rs, optionally checks out the pre-fix version from
    origin/main, reinstalls the package, times the evaluated expression,
    and always restores (and reinstalls) the working-tree sources.

    Args:
        version: "original" to test origin/main's src/lib.rs; anything else
            tests the working-tree (fixed) version.
        code_to_test: Python expression string passed to eval(); it may
            reference module-level names such as ``text``.

    Returns:
        Tuple of (evaluation result, elapsed seconds).

    Raises:
        subprocess.CalledProcessError: If backup, checkout, or install fails.
    """
    # Reserve a temp file path to hold a backup of the current lib.rs.
    with tempfile.NamedTemporaryFile(suffix='.rs', delete=False) as temp:
        temp_path = temp.name

    # Backup current lib.rs. List-form subprocess calls avoid shell quoting
    # and injection issues with the temp path; check=True surfaces failures
    # instead of silently benchmarking the wrong build.
    subprocess.run(["cp", "src/lib.rs", temp_path], check=True)

    try:
        if version == "original":
            # Checkout the pre-fix version of the Rust source.
            subprocess.run(
                ["git", "checkout", "origin/main", "--", "src/lib.rs"],
                check=True,
            )
        # else: the working tree already holds the fixed version.

        # Rebuild/install so the selected sources are the active extension.
        subprocess.run(
            ["pip", "install", "-e", "."],
            check=True,
            stdout=subprocess.DEVNULL,
        )

        # SECURITY NOTE: eval() executes arbitrary code. Acceptable here only
        # because this is a local dev script with hard-coded expressions;
        # never pass untrusted input through this parameter.
        start = time.perf_counter()
        result = eval(code_to_test)
        end = time.perf_counter()

        return result, end - start
    finally:
        # Best-effort restore of our version, even if the try block raised;
        # no check=True so a restore failure doesn't mask the original error.
        subprocess.run(["cp", temp_path, "src/lib.rs"])
        os.unlink(temp_path)
        subprocess.run(["pip", "install", "-e", "."], stdout=subprocess.DEVNULL)

# Inputs that stress the BPE merge loop: long single-byte runs, a run
# embedded in normal text, and a short repeated multi-byte pattern.
test_cases = [
    ("X" * 1000),
    ("X" * 10000),
    ("X" * 50000),
    ("Hello world! " + ("X" * 1000) + " This is a test."),
    ("ABC" * 1000)
]

print("Comparing tokenization between original and fixed code:")
print("------------------------------------------------------")

# NOTE: the loop variable must stay named `text` — the eval'd expression
# below resolves it through this module's globals.
for case_no, text in enumerate(test_cases, start=1):
    print(f"Test case #{case_no}:")

    code = "len(tiktoken.get_encoding('cl100k_base').encode(text))"

    # Token count and timing from the pre-fix build...
    orig_tokens, orig_time = run_test_with_code_version("original", code)
    # ...and from the current (fixed) build.
    fixed_tokens, fixed_time = run_test_with_code_version("fixed", code)

    print(f" Original: {orig_tokens} tokens in {orig_time:.4f} seconds")
    print(f" Fixed: {fixed_tokens} tokens in {fixed_time:.4f} seconds")

    if orig_tokens != fixed_tokens:
        print(f" ✗ Tokenizations differ! Original: {orig_tokens}, Fixed: {fixed_tokens}")
    else:
        print(" ✓ Tokenizations match!")
    print()
17 changes: 17 additions & 0 deletions compare_repetitive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import time
import tiktoken

def test_repetitive_encoding(length):
    """Encode a string of ``length`` repeated 'X' characters and print timings.

    Args:
        length: Number of 'X' characters in the test string.
    """
    text = "X" * length
    encoder = tiktoken.get_encoding("cl100k_base")

    # perf_counter is monotonic and high-resolution; time.time can be too
    # coarse for sub-millisecond encode calls, skewing the tokens/sec figure.
    start_time = time.perf_counter()
    tokens = encoder.encode(text)
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    print(f"Encoded {length} 'X's into {len(tokens)} tokens")
    print(f"Time taken: {elapsed:.4f} seconds")
    print(f"Tokens per second: {len(tokens) / elapsed:.2f}")

# Exercise the encoder on a mid-sized pathological input (50k identical bytes),
# the class of input the BPE stack-overflow fix targets.
print("Testing with medium repetitive string (50,000 Xs)")
test_repetitive_encoding(50000)
29 changes: 29 additions & 0 deletions compare_tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import tiktoken

def test_with_repetitive_strings():
    """Tokenize repetitive strings of several sizes and report token counts.

    Prints the character-to-token ratio for pure single-character runs, then
    checks a mixed string and a repeated multi-character pattern.
    """
    print("Testing tokenization equivalence:")

    # Build the encoder once; the original recreated it on every loop
    # iteration and relied on the loop variable leaking out of the loop
    # for the post-loop checks below.
    encoder = tiktoken.get_encoding("cl100k_base")

    for length in [1000, 10000, 50000, 100000]:
        text = "X" * length
        tokens = encoder.encode(text)
        print(f"Length {length} -> {len(tokens)} tokens")

        # Check the ratio
        # Note: For most strings, the tokens should be about 1/8 the length
        # for a single repeated character because bytes are merged efficiently
        print(f" Character-to-token ratio: {length / len(tokens):.2f}")

    # Also try with some mixed content
    text = "Hello world! " + ("X" * 10000) + " This is a test."
    tokens = encoder.encode(text)
    print(f"Mixed content (mostly repetitive): {len(tokens)} tokens")

    # Try with a more complex repetitive pattern
    text = ("ABC" * 10000)
    tokens = encoder.encode(text)
    print(f"'ABC' repeated 10000 times: {len(tokens)} tokens")

# Run the equivalence checks when the script is executed.
test_with_repetitive_strings()
Loading