138 changes: 137 additions & 1 deletion python/langsmith/client.py
@@ -1,3 +1,3 @@
"""Client for interacting with the LangSmith API.

Use the client to customize API keys / workspace connections, SSL certs,
@@ -92,7 +92,7 @@
serialized_run_operation_to_multipart_parts_and_context,
)
from langsmith._internal._serde import dumps_json as _dumps_json
from langsmith.schemas import AttachmentInfo
from langsmith.schemas import AttachmentInfo, ExampleWithRuns


def _check_otel_enabled() -> bool:
@@ -8269,6 +8269,142 @@
**kwargs,
)

def _paginate_examples_with_runs(
self,
dataset_id: ID_TYPE,
session_id: uuid.UUID,
preview: bool = False,
comparative_experiment_id: Optional[uuid.UUID] = None,
filters: dict[uuid.UUID, list[str]] | None = None,
limit: Optional[int] = None,
) -> Iterator[list[ExampleWithRuns]]:
"""Paginate through examples with runs and yield batches.

Args:
dataset_id: UUID of the dataset whose examples with runs are fetched.
session_id: Session UUID (same as the project_id) used to filter runs.
preview: Whether to return preview data only
comparative_experiment_id: Optional comparative experiment UUID
filters: Optional filters to apply
limit: Maximum total number of results to return

Yields:
Batches of run results as lists of ExampleWithRuns instances
"""
offset = 0
results_count = 0

while True:
remaining = (limit - results_count) if limit is not None else None
batch_limit = min(100, remaining) if remaining is not None else 100

body = {
"session_ids": [session_id],
"offset": offset,
"limit": batch_limit,
"preview": preview,
"comparative_experiment_id": comparative_experiment_id,
"filters": filters,
}

response = self.request_with_retries(
"POST",
f"/datasets/{dataset_id}/runs",
request_kwargs={"data": _dumps_json(body)},
)

batch = response.json()
if not batch:
break

# Transform raw dictionaries to ExampleWithRuns instances
examples_batch = [ls_schemas.ExampleWithRuns(**result) for result in batch]
yield examples_batch
results_count += len(batch)

if len(batch) < batch_limit or (limit is not None and results_count >= limit):
break

offset += len(batch)
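# Editorial sketch (not part of this PR): one way this paginator might be
# consumed, assuming a hypothetical Client instance and placeholder dataset
# and session UUIDs. Each yielded batch holds at most 100 ExampleWithRuns,
# so memory stays bounded even for large experiments.
import uuid
from itertools import chain

client = Client()  # hypothetical instance
batches = client._paginate_examples_with_runs(
    dataset_id=uuid.UUID("00000000-0000-0000-0000-000000000000"),  # hypothetical dataset UUID
    session_id=uuid.UUID("00000000-0000-0000-0000-000000000000"),  # hypothetical session/project UUID
    limit=250,
)
for example in chain.from_iterable(batches):  # flatten per-request batches into one stream
    print(example.id, len(example.runs))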

def get_experiment_results(
self,
name: Optional[str] = None,
project_id: Optional[uuid.UUID] = None,
preview: bool = False,
comparative_experiment_id: Optional[uuid.UUID] = None,
filters: dict[uuid.UUID, list[str]] | None = None,
limit: Optional[int] = None,
) -> ls_schemas.ExperimentResults:
"""Get results for an experiment, including experiment session aggregated stats and experiment runs for each dataset example.

Experiment results may not be available immediately after the experiment is created.

Args:
name: The experiment name.
project_id: The experiment's tracing project ID (also called the session_id); it can be found in the URL of the experiment page in LangSmith.
preview: Whether to return lightweight preview data only. When True,
fetches inputs_preview/outputs_preview summaries instead of full inputs/outputs from S3 storage.
This is faster and uses less bandwidth.
comparative_experiment_id: Optional comparative experiment UUID for pairwise comparison experiment results.
filters: Optional filters to apply to the results.
limit: Maximum number of results to return.

Returns:
ExperimentResults with stats (a TracerSessionResult) and an iterator of examples_with_runs (ExampleWithRuns instances).

Raises:
ValueError: If no experiment is found for the given name or project_id.

Example:
>>> client = Client()
>>> results = client.get_experiment_results(
... project_id="037ae90f-f297-4926-b93c-37d8abf6899f",
... )
>>> for example_with_runs in results["examples_with_runs"]:
... print(example_with_runs.dict())

>>> # Access aggregated experiment stats
>>> print(f"Total runs: {results['stats'].run_count}")
>>> print(f"Total cost: {results['stats'].total_cost}")
>>> print(f"P50 latency: {results['stats'].latency_p50}")

"""
if name and not project_id:
projects = list(self.list_projects(name=name))
if not projects:
raise ValueError(f"No experiment found with name: '{name}'")
project_id = projects[0].id

# Get aggregated stats for the experiment project/session
project_stats = list(
self.list_projects(
project_ids=[cast(uuid.UUID, project_id)], include_stats=True
)
)

if not project_stats:
raise ValueError(f"No experiment found with project_id: '{project_id}'")

dataset_id = project_stats[0].reference_dataset_id

def _get_examples_with_runs_iterator():
"""Yield examples with corresponding experiment runs."""
for batch in self._paginate_examples_with_runs(
dataset_id=dataset_id,
session_id=project_id,
preview=preview,
comparative_experiment_id=comparative_experiment_id,
filters=filters,
limit=limit,
):
yield from batch

return ls_schemas.ExperimentResults(
stats=project_stats[0],
examples_with_runs=_get_examples_with_runs_iterator(),
)
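# Editorial sketch (not part of this PR): exercising the preview and limit
# parameters documented above; the UUID below is a hypothetical experiment
# session/project id.
import uuid

client = Client()
results = client.get_experiment_results(
    project_id=uuid.UUID("037ae90f-f297-4926-b93c-37d8abf6899f"),
    preview=True,  # fetch inputs_preview/outputs_preview instead of full payloads
    limit=10,  # cap the number of examples returned
)
print("runs:", results["stats"].run_count)
for example in results["examples_with_runs"]:
    print(example.id, len(example.runs))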


def convert_prompt_to_openai_format(
messages: Any,
2 changes: 0 additions & 2 deletions python/langsmith/evaluation/_arunner.py
@@ -648,7 +648,6 @@ def _reset_example_attachments(self, example: schemas.Example) -> schemas.Exampl
outputs=example.outputs,
metadata=example.metadata,
modified_at=example.modified_at,
runs=example.runs,
source_run_id=example.source_run_id,
attachments=new_attachments,
_host_url=example._host_url,
@@ -767,7 +766,6 @@ def _get_example_with_readers(self, example: schemas.Example) -> schemas.Example
outputs=example.outputs,
metadata=example.metadata,
modified_at=example.modified_at,
runs=example.runs,
source_run_id=example.source_run_id,
attachments=new_attachments,
_host_url=example._host_url,
1 change: 0 additions & 1 deletion python/langsmith/evaluation/_runner.py
@@ -1391,7 +1391,6 @@ def _reset_example_attachment_readers(
outputs=example.outputs,
metadata=example.metadata,
modified_at=example.modified_at,
runs=example.runs,
source_run_id=example.source_run_id,
attachments=new_attachments,
_host_url=example._host_url,
17 changes: 16 additions & 1 deletion python/langsmith/schemas.py
@@ -2,6 +2,7 @@

from __future__ import annotations

from collections.abc import Iterator
from datetime import datetime, timedelta, timezone
from decimal import Decimal
from enum import Enum
@@ -158,7 +159,6 @@ class Example(ExampleBase):
)
dataset_id: UUID = Field(default=UUID("00000000-0000-0000-0000-000000000000"))
modified_at: Optional[datetime] = Field(default=None)
runs: list[Run] = Field(default_factory=list)
source_run_id: Optional[UUID] = None
attachments: Optional[dict[str, AttachmentInfo]] = Field(default=None)
"""Dictionary with attachment names as keys and a tuple of the S3 url
@@ -1261,3 +1261,18 @@ class UpsertExamplesResponse(TypedDict):
"""The number of examples that were upserted."""
example_ids: list[str]
"""The ids of the examples that were upserted."""


class ExampleWithRuns(Example):
"""Example with runs."""

runs: list[Run] = Field(default_factory=list)
"""The runs of the example."""


class ExperimentResults(TypedDict):
"""Results container for experiment data with stats and examples."""

stats: TracerSessionResult
examples_with_runs: Iterator[ExampleWithRuns]
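# Editorial sketch (not part of this PR): `stats` is loaded eagerly, while
# `examples_with_runs` is a lazy, single-pass iterator, so materialize it with
# list() if it must be traversed more than once. A hypothetical consumer:
def summarize(results: ExperimentResults) -> None:
    """Print aggregate run count followed by per-example run counts."""
    print("total runs:", results["stats"].run_count)
    for ex in results["examples_with_runs"]:  # consumes the iterator
        print(ex.id, len(ex.runs))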
80 changes: 80 additions & 0 deletions python/tests/integration_tests/test_client.py
@@ -3567,3 +3567,83 @@ def export_batch(self, run_ops, otel_context_map):
readable_span.attributes[_otel_exporter.GENAI_COMPLETION]
== '{"answer":"Hello, User!"}'
)


def test_get_experiment_results(langchain_client: Client) -> None:
"""Test get_experiment_results method with evaluation data."""
dataset_name = "__test_evaluate_attachments" + uuid4().hex[:4]
dataset = _create_dataset(langchain_client, dataset_name)

# Create example with attachments
example = ExampleCreate(
inputs={"question": "What is shown in the image?"},
outputs={"answer": "test image"},
attachments={
"image": ("image/png", b"fake image data for testing"),
},
)

langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example])

def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]:
# Verify we receive the attachment data
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {"answer": "test image"}

def evaluator(
outputs: dict, reference_outputs: dict, attachments: dict
) -> Dict[str, Any]:
assert "image" in attachments
assert "presigned_url" in attachments["image"]
image_data = attachments["image"]["reader"]
assert image_data.read() == b"fake image data for testing"
return {
"score": float(
reference_outputs.get("answer") == outputs.get("answer") # type: ignore
)
}

results = langchain_client.evaluate(
target,
data=dataset_name,
evaluators=[evaluator],
num_repetitions=2,
)

assert len(results) == 2

experiment_name = results.experiment_name

time.sleep(10)
# Test get_experiment_results method
experiment_results = langchain_client.get_experiment_results(name=experiment_name)

# Test that we get stats
assert experiment_results["stats"] is not None
stats = experiment_results["stats"]
assert hasattr(stats, "run_count")
assert stats.run_count > 0

# Test that we get examples iterator
examples_list = list(experiment_results["examples_with_runs"])
assert len(examples_list) > 0
# Test with limit parameter
limited_results = langchain_client.get_experiment_results(
name=experiment_name, limit=1
)
limited_examples = list(limited_results["examples_with_runs"])
assert len(limited_examples) == 1

# Test stats are the same regardless of limit (since stats come from project)
assert limited_results["stats"].run_count == experiment_results["stats"].run_count

# Test preview mode - should be faster and return preview data
preview_results = langchain_client.get_experiment_results(
name=experiment_name, preview=True
)
assert len(list(preview_results["examples_with_runs"])) > 0

safe_delete_dataset(langchain_client, dataset_name=dataset_name)
4 changes: 2 additions & 2 deletions vendor/orjson/test/test_api.py
@@ -79,7 +79,7 @@ def test_loads_recursion_valid_limit_mixed(self):
loads() recursion limit at limit mixed
"""
n = LOADS_RECURSION_LIMIT
value = b"[" b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
value = b'[{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
pytest.raises(orjson.JSONDecodeError, orjson.loads, value)
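# Editorial note (not part of this PR): adjacent bytes literals are concatenated
# at parse time, before `*` is applied, so the old spelling b"[" b'{"key":' * n
# and the new b'[{"key":' * n build identical bytes objects; the rewrite (here
# and in the pretty variant below) only makes that explicit. A quick check for
# a small n:
assert (b"[" b'{"key":' * 3) == (b'[{"key":' * 3)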

def test_loads_recursion_valid_excessive_array(self):
@@ -111,7 +111,7 @@ def test_loads_recursion_valid_limit_mixed_pretty(self):
loads() recursion limit at limit mixed pretty
"""
n = LOADS_RECURSION_LIMIT
value = b"[\n " b'{"key":' * n + b'{"key":true}' + b"}" * n + b"]"
value = b'[\n {"key":' * n + b'{"key":true}' + b"}" * n + b"]"
pytest.raises(orjson.JSONDecodeError, orjson.loads, value)

def test_loads_recursion_valid_excessive_array_pretty(self):