diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml index a8da4e59..c509cffa 100644 --- a/.github/workflows/create-release.yml +++ b/.github/workflows/create-release.yml @@ -52,7 +52,7 @@ jobs: - name: Adding file run: | - git add pyproject.toml + git add pyproject.toml uv.lock git fetch --quiet --tags git commit -m "v${{ inputs.version }}" --allow-empty git tag v${{ inputs.version }} diff --git a/README.md b/README.md index 10c06369..08d9a9ff 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Either use `hub.projects.list()` to get a list of all projects, or use ### Import a dataset -Let's now create a dataset and add a conversation example. +Let's now create a dataset and add a chat test case example. ```python # Let's create a dataset @@ -80,12 +80,12 @@ dataset = hub.datasets.create( ) ``` -We can now add a conversation example to the dataset. This will be used +We can now add a chat test case example to the dataset. This will be used for the model evaluation. ```python -# Add a conversation example -hub.conversations.create( +# Add a chat test case example +hub.chat_test_cases.create( dataset_id=dataset.id, messages=[ dict(role="user", content="What is the capital of France?"), @@ -107,10 +107,10 @@ hub.conversations.create( ) ``` -These are the attributes you can set for a conversation (the only +These are the attributes you can set for a chat test case (the only required attribute is `messages`): -- `messages`: A list of messages in the conversation. Each message is a dictionary with the following keys: +- `messages`: A list of messages in the chat. Each message is a dictionary with the following keys: - `role`: The role of the message, either "user" or "assistant". - `content`: The content of the message. @@ -118,10 +118,10 @@ required attribute is `messages`): - `demo_output`: A demonstration of a (possibly wrong) output from the model with an optional metadata. This is just for demonstration purposes. - - `checks`: A list of checks that the conversation should pass. This is used for evaluation. Each check is a dictionary with the following keys: + - `checks`: A list of checks that the chat test case should pass. This is used for evaluation. Each check is a dictionary with the following keys: - `identifier`: The identifier of the check. If it's a built-in check, you will also need to provide the `params` dictionary. The built-in checks are: - `correctness`: The output of the model should match the reference. - - `conformity`: The conversation should follow a set of rules. + - `conformity`: The chat should follow a set of rules. - `groundedness`: The output of the model should be grounded in the conversation. - `string_match`: The output of the model should contain a specific string (keyword or sentence). - `metadata`: The metadata output of the model should match a list of JSON path rules. @@ -137,15 +137,13 @@ required attribute is `messages`): - `expected_value_type`: The expected type of the value at the JSON path, one of `string`, `number`, `boolean`. - For the `semantic_similarity` check, the parameters are `reference` (type: `str`) and `threshold` (type: `float`), where `reference` is the expected output and `threshold` is the similarity score below which the check will fail. -You can add as many conversations as you want to the dataset. +You can add as many chat test cases as you want to the dataset. Again, you'll find your newly created dataset in the Hub UI. 
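+
+For example, a chat test case combining the `metadata` and `semantic_similarity`
+checks described above might look like this (a sketch: the values, JSON path,
+and threshold are illustrative placeholders):
+
+```python
+hub.chat_test_cases.create(
+    dataset_id=dataset.id,
+    messages=[
+        dict(role="user", content="What is the capital of France?"),
+    ],
+    checks=[
+        # Compare values extracted from the model's metadata output
+        # via JSON path rules against expected values and types
+        dict(
+            identifier="metadata",
+            params=dict(
+                json_path_rules=[
+                    dict(
+                        json_path="$.confidence",
+                        expected_value="high",
+                        expected_value_type="string",
+                    ),
+                ],
+            ),
+        ),
+        # Fail if the output's similarity to the reference drops
+        # below the threshold
+        dict(
+            identifier="semantic_similarity",
+            params=dict(
+                reference="The capital of France is Paris.",
+                threshold=0.8,
+            ),
+        ),
+    ],
+)
+```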
### Configure a model/agent -Before running our first evaluation, we'll need to set up a model. -You'll need an API endpoint ready to serve the model. Then, you can -configure the model API in the Hub: +Before running our first evaluation, we'll need to set up a model. You'll need an API endpoint ready to serve the model. Then, you can configure the model API in the Hub: ```python model = hub.models.create( @@ -159,8 +157,7 @@ model = hub.models.create( ) ``` -We can test that everything is working well by running a chat with the -model: +We can test that everything is working well by running a chat with the model: ```python response = model.chat( @@ -198,8 +195,7 @@ eval_run = client.evaluate( ) ``` -The evaluation will run asynchronously on the Hub. To retrieve the -results once the run is complete, you can use the following: +The evaluation will run asynchronously on the Hub. To retrieve the results once the run is complete, you can use the following: ```python @@ -213,5 +209,4 @@ eval_run.print_metrics() **Tip** You can directly pass IDs to the evaluate function, e.g. -`model=model_id` and `dataset=dataset_id`, without having to retrieve -the objects first. \ No newline at end of file +`model=model_id` and `dataset=dataset_id`, without having to retrieve the objects first. \ No newline at end of file diff --git a/examples/example.sh b/examples/example.sh deleted file mode 100755 index 02b6e640..00000000 --- a/examples/example.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -set -eu -# Set env variable, to avoid giving this info everytime -export GSK_API_KEY=2b437295-dabe-4084-ba03-cdb259e3e678 -export GSK_HUB_URL=http://backend.llm.localhost/ -folder_path=./test-folder - -rm -rf $folder_path -mkdir -p $folder_path - - -project_id=$(python -m giskard_hub.cli projects | jq --raw-output .[0].id) -model_id=$(python -m giskard_hub.cli models --project-id $project_id | jq --raw-output .[0].id) -dataset_id=$(python -m giskard_hub.cli datasets --project-id $project_id | jq --raw-output .[0].id) - -python giskard_hub.cli evaluate --folder-path $folder_path --dataset-id $dataset_id --model-id $model_id --local-mode -execution_id=$(find $folder_path -type f | grep ".json$" | head -n1 | xargs -I {} jq --raw-output .execution_id {}) - -# Following line is faking changing data into the json with the agent -find $folder_path -type f | grep ".json$" | xargs -I {} sed -i 's|"output": null|"output": "Sry, I was not paying attention"|g' {} | sh -python giskard_hub.cli update-evaluations --evaluation-path $folder_path - -python giskard_hub.cli results --execution-id $execution_id diff --git a/examples/example_python.py b/examples/example_python.py deleted file mode 100644 index e4dd459c..00000000 --- a/examples/example_python.py +++ /dev/null @@ -1,53 +0,0 @@ -from giskard_hub.client import HubClient -from giskard_hub.data import Evaluation, Project - - -def dummy_model(all_data: Evaluation): - # Here, all data contains everything - # - rules - # - tags - # - the conversation - # and so on.. 
-    # Most probably, user will want to take extra care of all_data.conversation.messages
-
-    # Following line is simulated calling a dummy agent and updating the evaluation
-    all_data.set_output("Sry, I was not paying attention")
-
-    # Alternaltively, could be done like this
-    # all_data.output = ModelOutput(response=LLMMessage(role="assistant", content="Sry, I was not paying attention"), metadata={})
-
-
-if __name__ == "__main__":
-    # Initialise client
-    client = HubClient(
-        api_key="2b437295-dabe-4084-ba03-cdb259e3e678",
-        hub_url="http://backend.llm.localhost/",
-    )
-    # api_key and hub url can also be provided by setting env variable GSK_API_KEY and GSK_HUB_URL
-    project: Project = client.get_projects()[0]
-
-    # Get models and check if any exist
-    models = client.models.list(project.id)
-    if not models:
-        raise ValueError(f"No models found in project {project.id}")
-    model = models[0]
-
-    # Get datasets and check if any exist
-    datasets = client.get_datasets(project.id)
-    if not datasets:
-        raise ValueError(f"No datasets found in project {project.id}")
-    dataset = datasets[0]
-
-    to_complete = client.evaluate(
-        model=model.id,
-        dataset=dataset.id,
-    )
-    execution_id = to_complete[0].execution_id
-    for elt in to_complete:
-        dummy_model(elt)
-
-    updates = client.update_evaluations(to_complete)
-
-    results = client.get_results(execution_id=execution_id)
-    print("Got results")
-    print(results)
diff --git a/script-docs/hub/sdk/checks.rst b/script-docs/hub/sdk/checks.rst
index 8b61ccfe..c35d8a7e 100644
--- a/script-docs/hub/sdk/checks.rst
+++ b/script-docs/hub/sdk/checks.rst
@@ -11,7 +11,7 @@ The Giskard Hub provides a set of built-in checks that cover common use cases, s
 
 * **Correctness**: Verifies if the agent's response matches the expected output (reference answer).
 * **Conformity**: Ensures the agent's response adheres to the rules, such as "The agent must be polite."
-* **Groundedness**: Ensures the agent's response is grounded in the conversation.
+* **Groundedness**: Ensures the agent's response is grounded in a specific context.
 * **String matching**: Checks if the agent's response contains a specific string, keyword, or sentence.
 * **Metadata**: Verifies the presence of specific (tool calls, user information, etc.) metadata in the agent's response.
 * **Semantic Similarity**: Verifies that the agent's response is semantically similar to the expected output.
@@ -46,7 +46,7 @@ Custom checks are reusable evaluation criteria that you can define for your proj
 
 Custom checks can be used in the following ways:
 
-- Applied to conversations in your datasets
+- Applied to chat test cases (conversations) in your datasets
 - Used during agent evaluations
 - Shared across your team **within the same project**
 - Modified or updated as your requirements evolve
@@ -243,7 +243,7 @@ You can delete a check using the ``hub.checks.delete()`` method. Here's a basic
 
 .. warning::
 
-    Deleting a check is permanent and cannot be undone. Make sure you're not using the check in any active conversations or evaluations before deleting it.
+    Deleting a check is permanent and cannot be undone. Make sure you're not using the check in any active chat test cases or evaluations before deleting it.
 
 List checks
 ___________
 
@@ -263,15 +263,15 @@ You can list all checks for a project using the ``hub.checks.list()`` method. He
 
 .. _add-checks-to-conversations:
 
-Add checks to conversations
----------------------------
+Add checks to chat test cases
+-----------------------------
 
-Once you've created a check, you can use it in your conversations by referencing its identifier:
+Once you've created a check, you can use it in your chat test cases by referencing its identifier:
 
 .. code-block:: python
 
-    # Add a conversation that uses your check
-    hub.conversations.create(
+    # Add a chat test case that uses your check
+    hub.chat_test_cases.create(
         dataset_id=dataset.id,
         messages=[
             {"role": "user", "content": "What's the formula for compound interest?"},
diff --git a/script-docs/hub/sdk/datasets/business.rst b/script-docs/hub/sdk/datasets/business.rst
index 22f75a2e..d6ff1064 100644
--- a/script-docs/hub/sdk/datasets/business.rst
+++ b/script-docs/hub/sdk/datasets/business.rst
@@ -5,7 +5,7 @@ Detect business failures by generating synthetic tests
 ======================================================
 
-Generative AI agents can face an endless variety of real-world scenarios, making it impossible to manually enumerate all possible test cases. Automated, synthetic test case generation is therefore essential—especially when you lack real user conversations to import as tests. However, a major challenge is to ensure that these synthetic cases are tailored to your business context, rather than being overly generic.
+Generative AI agents can face an endless variety of real-world scenarios, making it impossible to enumerate every possible test case manually. Automated, synthetic test case generation is therefore essential—especially when you lack real user chats to import as tests. However, a major challenge is to ensure that these synthetic cases are tailored to your business context, rather than being overly generic.
 
 By generating domain-specific synthetic tests, you can proactively identify and address these types of failures before they impact your users or business operations.
 
@@ -31,9 +31,9 @@ Before generating test cases, you need to `create a knowledge base
diff --git a/script-docs/hub/sdk/datasets/index.rst b/script-docs/hub/sdk/datasets/index.rst
--- a/script-docs/hub/sdk/datasets/index.rst
+++ b/script-docs/hub/sdk/datasets/index.rst
 
     hub.datasets.update("<dataset_id>", name="My updated dataset")
 
-Alternatively, you can update a dataset by managing its :ref:`conversations <conversations>`.
+Alternatively, you can update a dataset by managing its :ref:`chat test cases <chat_test_cases>`.
 
 Delete a dataset
 ________________
 
@@ -102,36 +101,34 @@ You can delete a dataset using the ``hub.datasets.delete()`` method. Here's a ba
 
     hub.datasets.delete("<dataset_id>")
 
 
-.. _conversations:
+.. _chat_test_cases:
 
-Conversations
--------------
+Chat Test Cases
+===============
 
-A conversation is a collection of messages together with evaluation checks (e.g., the expected answer, or rules that the agent must follow when responding).
+A chat test case (conversation) is a collection of messages together with evaluation checks (e.g., the expected answer, or rules that the agent must follow when responding).
 
-Create a conversation
-_____________________
+Create a chat test case
+_______________________
 
-You can now add conversations to the dataset. Conversations are a collection of messages together with evaluation checks (e.g., the expected answer, or rules that the agent must follow when responding).
+The parameters for creating a chat test case are:
 
-The parameters for creating a conversation are:
-
-- **dataset_id** (required): The ID of the dataset where the conversation will be created.
+- **dataset_id** (required): The ID of the dataset where the chat test case will be created.
 - **messages** (required): A list of messages, without the last assistant answer. Each message is a dictionary with keys ``role`` and ``content``.
 - **demo_output** (optional): A dictionary with the last assistant answer
-- **tags** (optional): A list of tags you can use to categorize and organize the conversations
+- **tags** (optional): A list of tags you can use to categorize and organize the chat test cases
 - **checks** (optional): A list of checks. For more information on checks, see the :doc:`/hub/sdk/checks` section.
 
 .. note::
 
     **Do not include last assistant answer in the list of messages.** In fact, during evaluation, we will pass
-    the conversation to your agent and expect it to generate an assistant answer. The newly generated answer will
+    the chat test case to your agent and expect it to generate an assistant answer. The newly generated answer will
     be evaluated against the checks.
 
-    If you want to show the last assistant answer to the user, you can include it in the conversation as ``demo_output``.
+    If you want to show the last assistant answer to the user, you can include it in the chat test case as ``demo_output``.
     In this way, it will be shown in the dataset, but not used in the evaluation.
 
 .. code-block:: python
 
-    hub.conversations.create(
+    hub.chat_test_cases.create(
         dataset_id=dataset.id,
 
         # A list of messages, without the last assistant answer
@@ -160,54 +157,53 @@ The parameters for creating a chat test case are:
         ]
     )
 
-Retrieve conversations
-______________________
+Retrieve chat test cases
+________________________
 
-You can also retrieve existing conversations for editing or deletion.
+You can also retrieve existing chat test cases for editing or deletion.
 
-For example, in certain cases you may want programmatically assign certain annotations to the conversation, or update
-the conversation with the new data.
+For example, in certain cases you may want to programmatically assign certain annotations to the chat test case, or update it with new data.
 
 .. code-block:: python
 
-    # Retrieve all conversations
-    conversations = hub.conversations.list(dataset_id=dataset.id)
+    # Retrieve all chat test cases
+    chat_test_cases = hub.chat_test_cases.list(dataset_id=dataset.id)
 
     # Or simply
-    conversations = dataset.conversations
+    chat_test_cases = dataset.chat_test_cases
 
-Update a conversation
-_____________________
+Update a chat test case
+_______________________
 
-After retrieving the conversations, we can update them.
-For example, let's say we want to add the tag "tech" to all conversations containing the word "laptop" in the user message:
+After retrieving the chat test cases, we can update them.
+For example, let's say we want to add the tag "tech" to all chat test cases containing the word "laptop" in the user message:
 
 .. code-block:: python
 
-    # Update the conversations
-    for conversation in conversations:
-        if "laptop" in conversation.messages[0].content:
+    # Update the chat test cases
+    for chat_test_case in chat_test_cases:
+        if "laptop" in chat_test_case.messages[0].content:
             # This will only update the tags, without changing the other fields
-            hub.conversations.update(
-                conversation.id,
-                tags=conversation.tags + ["tech"]
+            hub.chat_test_cases.update(
+                chat_test_case.id,
+                tags=chat_test_case.tags + ["tech"]
             )
 
-Delete a conversation
-_____________________
+Delete a chat test case
+_______________________
 
-Finally, you can delete conversations that you no longer need. For example:
+Finally, you can delete chat test cases that you no longer need. For example:
 
 .. code-block:: python
 
-    conversation_to_delete = dataset.conversations[0]
+    chat_test_case_to_delete = dataset.chat_test_cases[0]
 
-    hub.conversations.delete(conversation_to_delete.id)
+    hub.chat_test_cases.delete(chat_test_case_to_delete.id)
 
 .. warning::
 
-    Deleting a conversation is permanent and cannot be undone. Make sure you're not using the conversation in any active evaluations before deleting it.
+    Deleting a chat test case is permanent and cannot be undone. Make sure you're not using the chat test case in any active evaluations before deleting it.
 
 .. toctree::
    :hidden:
diff --git a/script-docs/hub/sdk/datasets/security.rst b/script-docs/hub/sdk/datasets/security.rst
index e359f3e9..16b9eac3 100644
--- a/script-docs/hub/sdk/datasets/security.rst
+++ b/script-docs/hub/sdk/datasets/security.rst
@@ -36,15 +36,15 @@ The ``generate_adversarial`` method creates test cases designed to expose securi
             "desc": "Tests for unintended information leakage"
         }
     ],
-    n_examples=20 # Optional: number of conversations per category to generate
+    n_examples=20 # Optional: number of chat test cases per category to generate
 )
 
 # Wait for the dataset to be created
 security_dataset.wait_for_completion()
 
-    # List the conversations in the dataset
-    for conversation in security_dataset.conversations:
-        print(conversation.messages[0].content)
+    # List the chat test cases in the dataset
+    for chat_test_case in security_dataset.chat_test_cases:
+        print(chat_test_case.messages[0].content)
 
 .. note::
 
diff --git a/script-docs/hub/sdk/evaluations.rst b/script-docs/hub/sdk/evaluations.rst
index 00607180..a19a8e68 100644
--- a/script-docs/hub/sdk/evaluations.rst
+++ b/script-docs/hub/sdk/evaluations.rst
@@ -16,7 +16,7 @@ The Giskard Hub provides a comprehensive evaluation system that supports:
 
 In this section, we will walk you through how to run and manage evaluations using the SDK.
 
-- An **evaluation** is a run of an agent on each conversation of a dataset using a set of checks.
+- An **evaluation** is a run of an agent on each chat test case (conversation) of a dataset using a set of checks.
 
 We recommend to systematically launch evaluation runs every time you deploy an updated agent in a pre-production or staging environment. In this way, you can collaborate with your team to ensure that the agent is performing as expected.
 
@@ -181,7 +181,7 @@ During the development phase, you may want to **evaluate a local model** that is
 
 Running the evaluation will allow you to compare the performance of your local model with the one that is
 already in production, or with other models that you use as a baseline. You will also be able to debug performance issues by
-checking each conversation in the Hub inteface.
+checking each chat test case (conversation) in the Hub interface.
 
 As usual, let's initialize the Hub client and set our current project ID:
 
@@ -349,12 +349,12 @@ You can list evaluation results using the ``hub.evaluations.list_entries()`` met
 
     eval_results = hub.evaluations.list_entries(eval_run.id)
 
-Each evaluation entry contains detailed information about the test case execution, including the conversation, model output, evaluation results, and optionally a failure category:
+Each evaluation entry contains detailed information about the test case execution, including the chat test case, model output, evaluation results, and optionally a failure category:
 
 ..
code-block:: python for entry in eval_results: - print(f"Conversation ID: {entry.conversation.id}") + print(f"Chat Test Case ID: {entry.chat_test_case.id}") # Check if there's a failure category assigned if entry.failure_category: @@ -367,7 +367,7 @@ Each evaluation entry contains detailed information about the test case executio # Check evaluation results if not entry.results: - print("No checks were run for this conversation") + print("No checks were run for this chat test case") for result in entry.results: print("-" * 50) print(f"Check: {result['name']}") diff --git a/script-docs/hub/sdk/index.rst b/script-docs/hub/sdk/index.rst index 80878c59..8c98bd2a 100644 --- a/script-docs/hub/sdk/index.rst +++ b/script-docs/hub/sdk/index.rst @@ -14,11 +14,11 @@ The Giskard Hub SDK provides a Python interface to interact with the Giskard Hub Create, update, and organize projects, agents and knowledge bases - .. grid-item-card:: Manage datasets and conversations + .. grid-item-card:: Manage datasets and chat test cases :link: datasets/index :link-type: doc - Create, update, and organize test datasets and conversations manually or using synthetic data generation + Create, update, and organize test datasets and chat test cases manually or using synthetic data generation .. grid-item-card:: Manage checks :link: checks @@ -123,7 +123,7 @@ That's it! You have created a project. Import a dataset ________________ -Let's now create a dataset and add a conversation example. +Let's now create a dataset and add a chat test case example. .. code-block:: python @@ -135,14 +135,14 @@ Let's now create a dataset and add a conversation example. ) -We can now add a conversation example to the dataset. This will be used for the model evaluation. +We can now add a chat test case example to the dataset. This will be used for the model evaluation. .. code-block:: python import random - # Add a conversation example - hub.conversations.create( + # Add a chat test case example + hub.chat_test_cases.create( dataset_id=dataset.id, messages=[ dict(role="user", content="What is the capital of France?"), @@ -165,23 +165,23 @@ We can now add a conversation example to the dataset. This will be used for the ] ) -These are the attributes you can set for a conversation (the only required attribute is ``messages``): +These are the attributes you can set for a chat test case (the only required attribute is ``messages``): -- ``messages``: A list of messages in the conversation. Each message is a dictionary with the following keys: +- ``messages``: A list of messages in the chat. Each message is a dictionary with the following keys: - ``role``: The role of the message, either "user" or "assistant". - ``content``: The content of the message. - ``demo_output``: A demonstration of a (possibly wrong) output from the model with an optional metadata. This is just for demonstration purposes. -- ``checks``: A list of checks that the conversation should pass. This is used for evaluation. Each check is a dictionary with the following keys: +- ``checks``: A list of checks that the chat should pass. This is used for evaluation. Each check is a dictionary with the following keys: - ``identifier``: The identifier of the check. If it's a built-in check, you will also need to provide the ``params`` dictionary. The built-in checks are: - ``correctness``: The output of the model should match the reference. - - ``conformity``: The conversation should follow a set of rules. 
-  - ``groundedness``: The output of the model should be grounded in the conversation.
+  - ``conformity``: The chat test case should follow a set of rules.
+  - ``groundedness``: The output of the model should be grounded in a specific context.
   - ``string_match``: The output of the model should contain a specific string (keyword or sentence).
   - ``metadata``: The metadata output of the model should match a list of JSON path rules.
   - ``semantic_similarity``: The output of the model should be semantically similar to the reference.
 - ``params``: A dictionary of parameters for the check. The parameters depend on the check type:
   - For the ``correctness`` check, the parameter is ``reference`` (type: ``str``), which is the expected output.
-  - For the ``conformity`` check, the parameter is ``rules`` (type: ``list[str]``), which is a list of rules that the conversation should follow.
+  - For the ``conformity`` check, the parameter is ``rules`` (type: ``list[str]``), which is a list of rules that the chat should follow.
   - For the ``groundedness`` check, the parameter is ``context`` (type: ``str``), which is the context in which the model should ground its output.
   - For the ``string_match`` check, the parameter is ``keyword`` (type: ``str``), which is the string that the model's output should contain.
   - For the ``metadata`` check, the parameter is ``json_path_rules`` (type: ``list[dict]``), which is a list of dictionaries with the following keys:
@@ -194,7 +194,7 @@ These are the attributes you can set for a chat test case (the only required attri
 
 For detailed information about these checks, including examples and how they work, see :doc:`/hub/ui/annotate`.
 
-You can add as many conversations as you want to the dataset.
+You can add as many chat test cases as you want to the dataset.
 
 Configure an Agent
 ___________________
diff --git a/script-docs/hub/sdk/projects.rst b/script-docs/hub/sdk/projects.rst
index 26f71f74..015f1746 100644
--- a/script-docs/hub/sdk/projects.rst
+++ b/script-docs/hub/sdk/projects.rst
@@ -10,7 +10,7 @@ Projects are the top-level organizational units in Giskard Hub. They provide a w
 
 Each project can contain:
 
 * **Agents**: The AI systems you want to test and evaluate
-* **Datasets**: Collections of test cases and conversations
+* **Datasets**: Collections of chat test cases (conversations)
 * **Knowledge bases**: Domain-specific information sources
 * **Evaluations**: Test runs and their results
 * **Users and groups**: Team members with different access levels
diff --git a/script-docs/hub/sdk/reference/client.rst b/script-docs/hub/sdk/reference/client.rst
index 46fe4658..ed9d6ae5 100644
--- a/script-docs/hub/sdk/reference/client.rst
+++ b/script-docs/hub/sdk/reference/client.rst
@@ -8,7 +8,7 @@ Client reference
 
 The HubClient is the main entry point for interacting with the Giskard Hub through the Python SDK.
 
 .. autoclass:: giskard_hub.client.HubClient
-    :members: projects, datasets, conversations, agents, models, evaluations, evals
+    :members: chat_test_cases, checks, datasets, evaluations, knowledge_bases, models, projects, scheduled_evaluations, evaluate
     :show-inheritance:
     :no-index:
diff --git a/script-docs/hub/sdk/reference/resources/index.rst b/script-docs/hub/sdk/reference/resources/index.rst
index 83b9c429..7c795f0c 100644
--- a/script-docs/hub/sdk/reference/resources/index.rst
+++ b/script-docs/hub/sdk/reference/resources/index.rst
@@ -1,11 +1,11 @@
 :og:title: Giskard Hub - Resources Reference
-:og:description: Learn about the resource classes in Giskard Hub.
Understand how to access and manage different entities through the client. +:og:description: Learn about the resource classes in Giskard Hub. Understand how to access and manage different entities through the SDK. =================== Resources reference =================== -Resources provide access to different entities in Giskard Hub through the client, such as projects, datasets, and models. +Resources provide access to different entities in Giskard Hub through the client, such as projects, datasets, models and others. .. automodule:: giskard_hub.resources :members: diff --git a/src/giskard_hub/__init__.py b/src/giskard_hub/__init__.py index 5d93cc01..cadc48eb 100644 --- a/src/giskard_hub/__init__.py +++ b/src/giskard_hub/__init__.py @@ -1,21 +1,9 @@ from __future__ import annotations from .client import HubClient -from .data import Dataset, Model, Project -from .data.chat import ChatMessage -from .data.chat_test_case import ChatTestCase -from .data.conversation import Conversation -hub_url: str | None = None -api_key: str | None = None +# Import data module's __all__ to avoid duplication +from .data import * +from .data import __all__ as _data_all - -__all__ = [ - "Dataset", - "ChatTestCase", - "Conversation", - "ChatMessage", - "Project", - "Model", - "HubClient", -] +__all__ = ["HubClient"] + _data_all diff --git a/src/giskard_hub/client.py b/src/giskard_hub/client.py index 15eafeaa..ba895c03 100644 --- a/src/giskard_hub/client.py +++ b/src/giskard_hub/client.py @@ -13,7 +13,6 @@ from .errors import HubConnectionError from .resources.chat_test_cases import ChatTestCasesResource from .resources.checks import ChecksResource -from .resources.conversations import ConversationsResource from .resources.datasets import DatasetsResource from .resources.evaluations import EvaluationsResource from .resources.knowledge_bases import KnowledgeBasesResource @@ -23,26 +22,19 @@ # pylint: disable=too-many-instance-attributes -# The `conversations` resource is deprecated and will be removed in the future. class HubClient(SyncClient): """Client class to handle interaction with the hub. Attributes ---------- - projects : ProjectsResource - Resource to interact with projects. - - datasets : DatasetsResource - Resource to interact with datasets. - chat_test_cases : ChatTestCasesResource - Resource to interact with chat test cases. + Resource to interact with chat test cases (conversations). - conversations : ConversationsResource - Resource to interact with conversations. + checks : ChecksResource + Resource to interact with checks. - models : ModelsResource - Resource to interact with models. + datasets : DatasetsResource + Resource to interact with datasets. evaluations : EvaluationsResource Resource to interact with evaluations. @@ -50,20 +42,23 @@ class HubClient(SyncClient): knowledge_bases : KnowledgeBasesResource Resource to interact with knowledge bases. + models : ModelsResource + Resource to interact with models. + + projects : ProjectsResource + Resource to interact with projects. + scheduled_evaluations : ScheduledEvaluationsResource Resource to interact with scheduled evaluations. - - evals : EvaluationsResource - Alias for `evaluations`. 
""" - projects: ProjectsResource - datasets: DatasetsResource chat_test_cases: ChatTestCasesResource - conversations: ConversationsResource - evaluations: EvaluationsResource checks: ChecksResource + datasets: DatasetsResource + evaluations: EvaluationsResource knowledge_bases: KnowledgeBasesResource + models: ModelsResource + projects: ProjectsResource scheduled_evaluations: ScheduledEvaluationsResource def __init__( @@ -136,20 +131,15 @@ def __init__( ) # Define the resources - self.projects = ProjectsResource(self) - self.datasets = DatasetsResource(self) self.chat_test_cases = ChatTestCasesResource(self) - self.conversations = ConversationsResource(self) - self.models = ModelsResource(self) - self.evaluations = EvaluationsResource(self) self.checks = ChecksResource(self) + self.datasets = DatasetsResource(self) + self.evaluations = EvaluationsResource(self) self.knowledge_bases = KnowledgeBasesResource(self) + self.models = ModelsResource(self) + self.projects = ProjectsResource(self) self.scheduled_evaluations = ScheduledEvaluationsResource(self) - @property - def evals(self): - return self.evaluations - def _headers(self): return { "X-API-Key": self._api_key, @@ -159,24 +149,25 @@ def _headers(self): def evaluate( self, *, - dataset: Dataset | str, - tags: List[str] = NOT_GIVEN, + dataset: str | Dataset, model: Model | str | Callable[[List[ChatMessage]], ModelOutput | str], name: str = NOT_GIVEN, + tags: List[str] = NOT_GIVEN, ): - """Evaluate a model on a dataset. + """Method to run an evaluation, either locally or remotely. Parameters ---------- dataset : str | Dataset - ID of the dataset that will be used for the evaluation, or the dataset entity. - tags: List[str], optional - List of tags to filter the conversations (chat test cases) that will be evaluated. + ID of the dataset that will be used for the evaluation, or the dataset entity itself. + List of tags to filter the chat test cases that will be evaluated. model : str | Model | Callable[[List[ChatMessage]], ModelOutput | str] ID of the model to evaluate, or a model entity, or a local model function. A local model function is a function that takes a list of messages and returns a `ModelOutput` or a string. name : str, optional The name of the evaluation run. If not provided, a random name will be automatically generated. + tags: List[str], optional + List of tags to filter the chat test cases that will be evaluated. 
Returns ------- @@ -201,9 +192,9 @@ def evaluate( return self.evaluations.create( dataset_id=dataset_id, - tags=tags, model_id=entity_to_id(model, Model), name=name, + tags=tags, ) def _run_local_eval( @@ -220,7 +211,7 @@ def _run_local_eval( # Run the local model entries = self.evaluations.list_entries(eval_run.id) for entry in entries: - model_output = model(entry.conversation.messages) + model_output = model(entry.chat_test_case.messages) self.evaluations.update_entry( eval_run.id, entry.id, model_output=model_output ) diff --git a/src/giskard_hub/data/__init__.py b/src/giskard_hub/data/__init__.py index a8c2aa75..d4db18c8 100644 --- a/src/giskard_hub/data/__init__.py +++ b/src/giskard_hub/data/__init__.py @@ -1,7 +1,6 @@ from .chat import ChatMessage from .chat_test_case import ChatTestCase from .check import Check -from .conversation import Conversation from .dataset import Dataset from .evaluation import EvaluationRun, Metric, ModelOutput from .knowledge_base import Document, KnowledgeBase, Topic @@ -12,7 +11,6 @@ __all__ = [ "Project", "Dataset", - "Conversation", "ChatTestCase", "Check", "ChatMessage", diff --git a/src/giskard_hub/data/conversation.py b/src/giskard_hub/data/conversation.py deleted file mode 100644 index 23ca0c30..00000000 --- a/src/giskard_hub/data/conversation.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import annotations - -import warnings -from dataclasses import dataclass -from typing import Any, Dict - -from .chat_test_case import ChatTestCase - - -@dataclass -class Conversation(ChatTestCase): - """A Dataset entry representing a conversation. - - Attributes - ---------- - messages : List[ChatMessage] - List of messages in the conversation. Each message is an object with a role and content attributes. - demo_output : Optional[ChatMessageWithMetadata], optional - Output of the agent for demonstration purposes. - tags : List[str], optional - List of tags for the conversation. - checks : List[CheckConfig], optional - List of checks to be performed on the conversation. - """ - - @classmethod - def from_dict(cls, data: Dict[str, Any], **kwargs) -> "Conversation": - warnings.warn( - "Conversation is deprecated and will be removed. Please use ChatTestCase instead.", - category=DeprecationWarning, - ) - return super().from_dict(data, **kwargs) diff --git a/src/giskard_hub/data/dataset.py b/src/giskard_hub/data/dataset.py index 3d089ab5..d6cce09e 100644 --- a/src/giskard_hub/data/dataset.py +++ b/src/giskard_hub/data/dataset.py @@ -1,12 +1,10 @@ from __future__ import annotations -import warnings from dataclasses import dataclass, field from typing import List, Optional from ._entity import EntityWithTaskProgress from .chat_test_case import ChatTestCase -from .conversation import Conversation from .task import TaskProgress @@ -19,32 +17,6 @@ class Dataset(EntityWithTaskProgress): project_id: Optional[str] = field(default=None) tags: List[str] = field(default_factory=list) - @property - def conversations(self): - """Return the conversations of the dataset.""" - warnings.warn( - "Conversation is deprecated and will be removed. Please use ChatTestCase operations instead.", - category=DeprecationWarning, - ) - if self._client and self.id: - return self._client.conversations.list(dataset_id=self.id) - return None - - def create_conversation(self, conversation: Conversation): - """Add a conversation to the dataset.""" - warnings.warn( - "Conversation is deprecated and will be removed. 
Please use ChatTestCase operations instead.", - category=DeprecationWarning, - ) - if not self._client or not self.id: - raise ValueError( - "This dataset instance is detached or unsaved, cannot add conversation." - ) - - return self._client.conversations.create( - dataset_id=self.id, **conversation.to_dict() - ) - @property def chat_test_cases(self): """Return the chat test cases of the dataset.""" diff --git a/src/giskard_hub/data/evaluation.py b/src/giskard_hub/data/evaluation.py index 2c7ea97b..68162ee8 100644 --- a/src/giskard_hub/data/evaluation.py +++ b/src/giskard_hub/data/evaluation.py @@ -10,7 +10,6 @@ from ._base import BaseData from ._entity import Entity, EntityWithTaskProgress from .chat_test_case import ChatTestCase -from .conversation import Conversation from .dataset import Dataset from .model import Model, ModelOutput from .project import FailureCategory @@ -142,7 +141,7 @@ class EvaluationEntry(Entity): """Evaluation entry.""" run_id: str - conversation: Conversation | ChatTestCase + chat_test_case: ChatTestCase model_output: ModelOutput | None = None results: List[EvaluatorResult] = field(default_factory=list) status: TaskStatus = TaskStatus.RUNNING @@ -152,10 +151,8 @@ class EvaluationEntry(Entity): def from_dict(cls, data: Dict[str, Any], **kwargs) -> "EvaluationEntry": data = dict(data) - if "chat_test_case" in data: - data["conversation"] = ChatTestCase.from_dict(data["chat_test_case"]) - else: - data["conversation"] = Conversation.from_dict(data["conversation"]) + # Process `chat_test_case` payload + data["chat_test_case"] = ChatTestCase.from_dict(data["chat_test_case"]) output = data.get("output") data["model_output"] = ModelOutput.from_dict(output) if output else None diff --git a/src/giskard_hub/data/scheduled_evaluation.py b/src/giskard_hub/data/scheduled_evaluation.py index 7ee6780c..ea3c1fd3 100644 --- a/src/giskard_hub/data/scheduled_evaluation.py +++ b/src/giskard_hub/data/scheduled_evaluation.py @@ -60,7 +60,7 @@ class ScheduledEvaluation(Entity): # pylint: disable=too-many-instance-attribut dataset_id : str The ID of the dataset to evaluate against. tags : List[str], optional - List of tags to filter the conversations that will be evaluated. + List of tags to filter the chat test cases that will be evaluated. run_count : int The number of times to run each test case (1-5). 
frequency : FrequencyOption diff --git a/src/giskard_hub/resources/__init__.py b/src/giskard_hub/resources/__init__.py index 2e7df2b4..80f932c9 100644 --- a/src/giskard_hub/resources/__init__.py +++ b/src/giskard_hub/resources/__init__.py @@ -1,5 +1,5 @@ from .chat_test_cases import ChatTestCasesResource -from .conversations import ConversationsResource +from .checks import ChecksResource from .datasets import DatasetsResource from .evaluations import EvaluationsResource from .knowledge_bases import KnowledgeBasesResource @@ -11,7 +11,7 @@ "ProjectsResource", "DatasetsResource", "ChatTestCasesResource", - "ConversationsResource", + "ChecksResource", "ModelsResource", "EvaluationsResource", "KnowledgeBasesResource", diff --git a/src/giskard_hub/resources/conversations.py b/src/giskard_hub/resources/conversations.py deleted file mode 100644 index 62c79a8f..00000000 --- a/src/giskard_hub/resources/conversations.py +++ /dev/null @@ -1,105 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import List, Optional - -from ..data._base import NOT_GIVEN -from ..data.chat import ChatMessage, ChatMessageWithMetadata -from ..data.check import CheckConfig -from ..data.conversation import Conversation -from ._resource import APIResource -from ._utils import prepare_chat_test_case_data as prepare_conversation_data - -_CONVERSATION_DEPRECATION_WARNING = "Conversation API is deprecated and will be removed. Please use ChatTestCase API instead." - - -class ConversationsResource(APIResource): - def retrieve(self, conversation_id: str): - warnings.warn( - _CONVERSATION_DEPRECATION_WARNING, - category=DeprecationWarning, - ) - return self._client.get( - f"/conversations/{conversation_id}", cast_to=Conversation - ) - - # pylint: disable=too-many-arguments - def create( - self, - *, - dataset_id: str, - messages: List[ChatMessage], - demo_output: Optional[ChatMessageWithMetadata] = None, - tags: Optional[List[str]] = None, - checks: Optional[List[CheckConfig]] = None, - ): - warnings.warn( - _CONVERSATION_DEPRECATION_WARNING, - category=DeprecationWarning, - ) - # pylint: disable=similarities - # The `conversations` resource is deprecated and will be removed in the future. - data = prepare_conversation_data( - dataset_id=dataset_id, - messages=messages, - demo_output=demo_output, - tags=tags, - checks=checks, - ) - - return self._client.post( - "/conversations", - json=data, - cast_to=Conversation, - ) - - # pylint: disable=too-many-arguments - def update( - self, - conversation_id: str, - *, - dataset_id: str = NOT_GIVEN, - messages: List[ChatMessage] = NOT_GIVEN, - demo_output: Optional[ChatMessageWithMetadata] = NOT_GIVEN, - tags: Optional[List[str]] = NOT_GIVEN, - checks: Optional[List[CheckConfig]] = NOT_GIVEN, - ) -> Conversation: - warnings.warn( - _CONVERSATION_DEPRECATION_WARNING, - category=DeprecationWarning, - ) - # pylint: disable=similarities - # The `conversations` resource is deprecated and will be removed in the future. 
- data = prepare_conversation_data( - dataset_id=dataset_id, - messages=messages, - demo_output=demo_output, - tags=tags, - checks=checks, - ) - - return self._client.patch( - f"/conversations/{conversation_id}", - json=data, - cast_to=Conversation, - ) - - def delete(self, conversation_id: str | List[str]) -> None: - warnings.warn( - _CONVERSATION_DEPRECATION_WARNING, - category=DeprecationWarning, - ) - return self._client.delete( - "/conversations", params={"conversation_ids": conversation_id} - ) - - def list(self, dataset_id: str) -> List[Conversation]: - warnings.warn( - _CONVERSATION_DEPRECATION_WARNING, - category=DeprecationWarning, - ) - data = self._client.get(f"/datasets/{dataset_id}/conversations?limit=100000") - return [ - Conversation.from_dict(d, _client=self._client) - for d in data.get("items", []) - ] diff --git a/src/giskard_hub/resources/evaluations.py b/src/giskard_hub/resources/evaluations.py index 6b8ad7a6..67240900 100644 --- a/src/giskard_hub/resources/evaluations.py +++ b/src/giskard_hub/resources/evaluations.py @@ -73,9 +73,9 @@ def create_local( cast_to=EvaluationRun, ) - def delete(self, execution_id: str | List[str]): + def delete(self, evaluation_id: str | List[str]): return self._client.delete( - "/evaluations", params={"execution_ids": execution_id} + "/evaluations", params={"evaluation_ids": evaluation_id} ) def list(self, project_id: str): diff --git a/src/giskard_hub/resources/scheduled_evaluations.py b/src/giskard_hub/resources/scheduled_evaluations.py index 2e365ecf..b16aae24 100644 --- a/src/giskard_hub/resources/scheduled_evaluations.py +++ b/src/giskard_hub/resources/scheduled_evaluations.py @@ -164,7 +164,7 @@ def create( # pylint: disable=too-many-arguments time : str The time to run the evaluation (HH:MM format). tags : List[str], optional - List of tags to filter the conversations that will be evaluated. + List of tags to filter the chat test cases that will be evaluated. run_count : int, optional The number of times to run each test case (1-5), by default 1. 
day_of_week : int, optional diff --git a/tests/test_chat_test_cases.py b/tests/test_chat_test_cases.py index 2a1cfdb7..4ebfacbc 100644 --- a/tests/test_chat_test_cases.py +++ b/tests/test_chat_test_cases.py @@ -5,7 +5,6 @@ from giskard_hub.data.chat import ChatMessage, ChatMessageWithMetadata from giskard_hub.data.chat_test_case import ChatTestCase from giskard_hub.data.check import CheckConfig -from giskard_hub.data.conversation import Conversation from giskard_hub.errors import ( HubAPIError, HubValidationError, @@ -522,69 +521,3 @@ def test_chat_test_cases_update_not_found_error(mock_client_with_errors): assert exc_info.value.status_code == 404 assert "Chat test case not found" in exc_info.value.message - - -def test_consistency_between_conversations_and_chat_test_cases(): - dto = { - "id": "conv_123", - "dataset_id": "ds_456", - "messages": [ - { - "role": "user", - "content": "Hello, how are you?", - }, - { - "role": "assistant", - "content": "I'm fine, thank you!", - }, - ], - "demo_output": { - "role": "assistant", - "content": "I'm here to help you.", - "metadata": {"source": "demo"}, - }, - "tags": ["greeting", "test"], - "checks": [], - } - - conversation = Conversation.from_dict(dto) - chat_test_case = ChatTestCase.from_dict(dto) - - # Check messages - assert len(chat_test_case.messages) == len(conversation.messages) - for i in range(len(chat_test_case.messages)): - assert chat_test_case.messages[i].role == conversation.messages[i].role - assert chat_test_case.messages[i].content == conversation.messages[i].content - - # Check demo_output - assert chat_test_case.demo_output.role == conversation.demo_output.role - assert chat_test_case.demo_output.content == conversation.demo_output.content - assert chat_test_case.demo_output.metadata == conversation.demo_output.metadata - assert len(chat_test_case.demo_output.metadata) == len( - conversation.demo_output.metadata - ) - for key in chat_test_case.demo_output.metadata: - assert key in conversation.demo_output.metadata - assert ( - chat_test_case.demo_output.metadata[key] - == conversation.demo_output.metadata[key] - ) - - # Check tags - assert len(chat_test_case.tags) == len(conversation.tags) - for tag in chat_test_case.tags: - assert tag in conversation.tags - - # Check checks - assert chat_test_case.checks == conversation.checks - assert len(chat_test_case.checks) == len(conversation.checks) - for i in range(len(chat_test_case.checks)): - assert chat_test_case.checks[i].identifier == conversation.checks[i].identifier - assert len(chat_test_case.checks[i].assertions) == len( - conversation.checks[i].assertions - ) - - # Check common attributes - assert chat_test_case.id == conversation.id - assert chat_test_case.created_at == conversation.created_at - assert chat_test_case.updated_at == conversation.updated_at diff --git a/tests/test_conversations.py b/tests/test_conversations.py deleted file mode 100644 index ba40dcd5..00000000 --- a/tests/test_conversations.py +++ /dev/null @@ -1,530 +0,0 @@ -from unittest.mock import MagicMock - -import pytest - -from giskard_hub.data.chat import ChatMessage, ChatMessageWithMetadata -from giskard_hub.data.check import CheckConfig -from giskard_hub.data.conversation import Conversation -from giskard_hub.data.dataset import Dataset -from giskard_hub.resources.conversations import ConversationsResource - - -@pytest.fixture -def mock_client(): - mock_client = MagicMock() - - # POST - mock_client.post.side_effect = lambda path, json=None, cast_to=None, **kwargs: ( - 
Conversation.from_dict(mock_client.post.return_value, _client=mock_client) - if cast_to == Conversation - else mock_client.post.return_value - ) - - # GET - mock_client.get.side_effect = lambda path, cast_to=None, **kwargs: ( - Conversation.from_dict(mock_client.get.return_value, _client=mock_client) - if cast_to == Conversation - else mock_client.get.return_value - ) - - # PATCH - mock_client.patch.side_effect = lambda path, json=None, cast_to=None, **kwargs: ( - Conversation.from_dict(mock_client.patch.return_value, _client=mock_client) - if cast_to == Conversation - else mock_client.patch.return_value - ) - - # DELETE - mock_client.delete.side_effect = lambda path, **kwargs: None - - return mock_client - - -@pytest.fixture -def sample_conversation(): - return Conversation( - messages=[ - ChatMessage(role="user", content="Hello, how can I help you?"), - ChatMessage(role="assistant", content="I need help with my order."), - ChatMessage(role="user", content="What's the issue with your order?"), - ], - demo_output=ChatMessageWithMetadata( - role="assistant", - content="My order #12345 hasn't arrived yet.", - metadata={"order_id": "12345", "issue_type": "delivery"}, - ), - tags=["customer-support", "order-issue"], - checks=[ - CheckConfig( - identifier="correctness", - params={"reference": "I'll help you track your order."}, - ), - CheckConfig( - identifier="conformity", - params={"rules": ["The assistant should be helpful and polite."]}, - ), - ], - ) - - -def test_conversation_create(mock_client): - """Test creating a conversation""" - mock_client.post.return_value = { - "id": "9c065c7d-421f-4fa1-aad3-902587837849", - "created_at": "2025-05-20T09:46:52.424Z", - "updated_at": "2025-05-20T09:46:52.424Z", - "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3", - "messages": [ - {"role": "user", "content": "Hello, how can I help you?"}, - {"role": "assistant", "content": "I need help with my order."}, - {"role": "user", "content": "What's the issue with your order?"}, - ], - "demo_output": { - "role": "assistant", - "content": "My order #12345 hasn't arrived yet.", - "metadata": {"order_id": "12345", "issue_type": "delivery"}, - }, - "tags": ["customer-support", "order-issue"], - "checks": [ - { - "identifier": "correctness", - "assertions": [ - { - "type": "correctness", - "reference": "I'll help you track your order.", - } - ], - "enabled": True, - }, - { - "identifier": "conformity", - "assertions": [ - { - "type": "conformity", - "rules": ["The assistant should be helpful and polite."], - } - ], - "enabled": True, - }, - ], - "comments": [], - } - - messages = [ - ChatMessage(role="user", content="Hello, how can I help you?"), - ChatMessage(role="assistant", content="I need help with my order."), - ChatMessage(role="user", content="What's the issue with your order?"), - ] - - demo_output = ChatMessageWithMetadata( - role="assistant", - content="My order #12345 hasn't arrived yet.", - metadata={"order_id": "12345", "issue_type": "delivery"}, - ) - - checks = [ - CheckConfig( - identifier="correctness", - params={"reference": "I'll help you track your order."}, - ), - CheckConfig( - identifier="conformity", - params={"rules": ["The assistant should be helpful and polite."]}, - ), - ] - - conversations_resource = ConversationsResource(mock_client) - - result = conversations_resource.create( - dataset_id="23868ed8-d12f-40b1-8398-8df4cd066da3", - messages=messages, - demo_output=demo_output, - tags=["customer-support", "order-issue"], - checks=checks, - ) - - assert mock_client.post.called - 
mock_client.post.assert_called_once() - - assert isinstance(result, Conversation) - assert result.id == "9c065c7d-421f-4fa1-aad3-902587837849" - assert len(result.messages) == 3 - assert result.messages[0].role == "user" - assert result.messages[0].content == "Hello, how can I help you?" - assert result.demo_output.content == "My order #12345 hasn't arrived yet." - assert result.demo_output.metadata == { - "order_id": "12345", - "issue_type": "delivery", - } - assert "customer-support" in result.tags - assert "order-issue" in result.tags - assert len(result.checks) == 2 - assert result.checks[0].identifier == "correctness" - assert result.checks[0].enabled - assert result.checks[0].params == { - "reference": "I'll help you track your order.", - "type": "correctness", - } - assert result.checks[1].identifier == "conformity" - assert result.checks[1].enabled - assert result.checks[1].params == { - "rules": ["The assistant should be helpful and polite."], - "type": "conformity", - } - assert "comments" not in result.to_dict() - - -def test_conversation_retrieve(mock_client): - """Test retrieving a conversation""" - mock_client.get.return_value = { - "id": "9c065c7d-421f-4fa1-aad3-902587837849", - "created_at": "2025-05-20T09:46:52.424Z", - "updated_at": "2025-05-20T09:46:52.424Z", - "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3", - "messages": [ - {"role": "user", "content": "Hello, what is the status of my last order?"} - ], - "demo_output": None, - "tags": ["customer-support"], - "checks": [], - "comments": [], - } - - conversations_resource = ConversationsResource(mock_client) - - result = conversations_resource.retrieve("9c065c7d-421f-4fa1-aad3-902587837849") - - assert mock_client.get.called - mock_client.get.assert_called_once_with( - "/conversations/9c065c7d-421f-4fa1-aad3-902587837849", cast_to=Conversation - ) - - assert isinstance(result, Conversation) - assert result.id == "9c065c7d-421f-4fa1-aad3-902587837849" - assert len(result.messages) == 1 - assert result.messages[0].role == "user" - assert result.messages[0].content == "Hello, what is the status of my last order?" 
- assert "customer-support" in result.tags - assert len(result.checks) == 0 - assert "comments" not in result.to_dict() - - -def test_conversation_list(mock_client): - """Test listing conversations""" - mock_client.get.return_value = { - "items": [ - { - "id": "9c065c7d-421f-4fa1-aad3-902587837849", - "created_at": "2025-05-20T09:46:52.424Z", - "updated_at": "2025-05-20T09:46:52.424Z", - "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3", - "messages": [{"role": "user", "content": "Hello!"}], - "demo_output": None, - "tags": ["greeting"], - "checks": [], - "comments": [ - { - "created_at": "2025-05-20T10:09:00.853Z", - "updated_at": "2025-05-20T10:09:00.853Z", - "uuid": "3fa85f64-5717-4562-b3fc-2c963f66afa6", - "comment": "string", - "user_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6", - "user_name": "string", - } - ], - }, - { - "id": "87fcaf80-2244-43fe-8e1c-b1ba14ed1f2c", - "created_at": "2025-05-20T09:46:52.424Z", - "updated_at": "2025-05-20T09:46:52.424Z", - "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3", - "messages": [{"role": "user", "content": "Help me with my order."}], - "demo_output": None, - "tags": ["order-issue"], - "checks": [], - "comments": [], - }, - ] - } - - conversations_resource = ConversationsResource(mock_client) - - original_list_method = ConversationsResource.list - - def mock_list(self, dataset_id): - data = self._client.get(f"/datasets/{dataset_id}/conversations?limit=100000") - return [ - Conversation.from_dict(item, _client=self._client) for item in data["items"] - ] - - ConversationsResource.list = mock_list - - try: - results = conversations_resource.list("23868ed8-d12f-40b1-8398-8df4cd066da3") - - assert mock_client.get.called - mock_client.get.assert_called_once_with( - "/datasets/23868ed8-d12f-40b1-8398-8df4cd066da3/conversations?limit=100000" - ) - - assert len(results) == 2 - - assert isinstance(results[0], Conversation) - assert results[0].id == "9c065c7d-421f-4fa1-aad3-902587837849" - assert results[0].messages[0].content == "Hello!" 
- assert "greeting" in results[0].tags - assert "comments" not in results[0].to_dict() - - assert isinstance(results[1], Conversation) - assert results[1].id == "87fcaf80-2244-43fe-8e1c-b1ba14ed1f2c" - assert "order-issue" in results[1].tags - assert "comments" not in results[1].to_dict() - finally: - ConversationsResource.list = original_list_method - - -def test_conversation_update(mock_client): - """Test updating a conversation""" - mock_client.patch.return_value = { - "id": "9c065c7d-421f-4fa1-aad3-902587837849", - "created_at": "2025-05-20T09:46:52.424Z", - "updated_at": "2025-05-20T09:46:52.424Z", - "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3", - "messages": [{"role": "user", "content": "Updated content!"}], - "demo_output": None, - "tags": ["updated-tag"], - "checks": [ - { - "identifier": "correctness", - "assertions": [ - {"type": "correctness", "reference": "Updated reference"} - ], - "enabled": True, - } - ], - } - - conversations_resource = ConversationsResource(mock_client) - - new_messages = [ChatMessage(role="user", content="Updated content!")] - new_tags = ["updated-tag"] - new_checks = [ - CheckConfig(identifier="correctness", params={"reference": "Updated reference"}) - ] - - result = conversations_resource.update( - conversation_id="9c065c7d-421f-4fa1-aad3-902587837849", - messages=new_messages, - tags=new_tags, - checks=new_checks, - ) - - assert mock_client.patch.called - - assert isinstance(result, Conversation) - assert result.id == "9c065c7d-421f-4fa1-aad3-902587837849" - assert result.messages[0].content == "Updated content!" - assert "updated-tag" in result.tags - assert result.checks[0].identifier == "correctness" - assert result.checks[0].params == { - "reference": "Updated reference", - "type": "correctness", - } - assert result.checks[0].enabled - assert "comments" not in result.to_dict() - - -def test_conversation_delete(mock_client): - """Test deleting a conversation""" - - conversations_resource = ConversationsResource(mock_client) - - conversations_resource.delete("9c065c7d-421f-4fa1-aad3-902587837849") - - mock_client.delete.assert_called_once_with( - "/conversations", - params={"conversation_ids": "9c065c7d-421f-4fa1-aad3-902587837849"}, - ) - - mock_client.reset_mock() - - conversations_resource.delete( - [ - "9c065c7d-421f-4fa1-aad3-902587837849", - "87fcaf80-2244-43fe-8e1c-b1ba14ed1f2c", - ] - ) - - mock_client.delete.assert_called_once_with( - "/conversations", - params={ - "conversation_ids": [ - "9c065c7d-421f-4fa1-aad3-902587837849", - "87fcaf80-2244-43fe-8e1c-b1ba14ed1f2c", - ] - }, - ) - - -def test_conversation_from_dict(): - """Test creating a Conversation from a dictionary""" - data = { - "id": "9c065c7d-421f-4fa1-aad3-902587837849", - "messages": [ - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there"}, - ], - "demo_output": { - "role": "assistant", - "content": "How can I help you?", - "metadata": {"intent": "greeting"}, - }, - "tags": ["test-tag"], - "checks": [ - { - "identifier": "correctness", - "assertions": [{"type": "correctness", "reference": "Hello world"}], - "enabled": True, - } - ], - } - - conversation = Conversation.from_dict(data) - - assert conversation.id == "9c065c7d-421f-4fa1-aad3-902587837849" - assert len(conversation.messages) == 2 - assert conversation.messages[0].role == "user" - assert conversation.messages[0].content == "Hello" - assert conversation.demo_output.role == "assistant" - assert conversation.demo_output.content == "How can I help you?" 
-def test_conversation_to_dict():
-    """Test converting a Conversation to a dictionary"""
-    data = {
-        "id": "9c065c7d-421f-4fa1-aad3-902587837849",
-        "dataset_id": "23868ed8-d12f-40b1-8398-8df4cd066da3",
-        "created_at": "2025-05-20T09:46:52.424Z",
-        "updated_at": "2025-05-20T09:46:52.424Z",
-        "messages": [
-            {"role": "user", "content": "Hello"},
-            {"role": "assistant", "content": "Hi there"},
-        ],
-        "demo_output": {
-            "role": "assistant",
-            "content": "How can I help you?",
-            "metadata": {"intent": "greeting"},
-        },
-        "tags": ["test-tag"],
-        "checks": [
-            {
-                "identifier": "correctness",
-                "assertions": [{"type": "correctness", "reference": "Hello world"}],
-                "enabled": True,
-            }
-        ],
-    }
-    conversation = Conversation.from_dict(data)
-
-    result = conversation.to_dict()
-
-    assert result["id"] == "9c065c7d-421f-4fa1-aad3-902587837849"
-    assert len(result["messages"]) == 2
-    assert result["messages"][0]["role"] == "user"
-    assert result["messages"][0]["content"] == "Hello"
-    assert result["demo_output"]["role"] == "assistant"
-    assert result["demo_output"]["content"] == "How can I help you?"
-    assert result["demo_output"]["metadata"] == {"intent": "greeting"}
-    assert "test-tag" in result["tags"]
-    assert result["checks"][0]["identifier"] == "correctness"
-    assert result["checks"][0]["params"] == {
-        "reference": "Hello world",
-        "type": "correctness",
-    }
-    assert "dataset_id" not in result
-    assert "comments" not in result
-
-
-def test_dataset_conversations_property(mock_client):
-    """Test accessing conversations through dataset property"""
-    conversations_list = [
-        Conversation.from_dict(
-            {
-                "id": "9c065c7d-421f-4fa1-aad3-902587837849",
-                "messages": [{"role": "user", "content": "Hello!"}],
-                "tags": ["greeting"],
-            },
-            _client=mock_client,
-        )
-    ]
-
-    dataset = Dataset(name="Test Dataset")
-    dataset.id = "23868ed8-d12f-40b1-8398-8df4cd066da3"
-
-    mock_conversations = MagicMock(spec=ConversationsResource)
-    mock_conversations.list.return_value = conversations_list
-
-    mock_client.conversations = mock_conversations
-    dataset._client = mock_client
-
-    result = dataset.conversations
-
-    mock_conversations.list.assert_called_once_with(
-        dataset_id="23868ed8-d12f-40b1-8398-8df4cd066da3"
-    )
-    assert result == conversations_list
-
-
-def test_create_conversation_through_dataset(mock_client):
-    """Test creating a conversation through dataset method"""
-    created_conversation = Conversation.from_dict(
-        {
-            "id": "9c065c7d-421f-4fa1-aad3-902587837849",
-            "messages": [{"role": "user", "content": "Hello!"}],
-            "tags": ["greeting"],
-        },
-        _client=mock_client,
-    )
-
-    dataset = Dataset(name="Test Dataset")
-    dataset.id = "23868ed8-d12f-40b1-8398-8df4cd066da3"
-
-    mock_conversations = MagicMock(spec=ConversationsResource)
-    mock_conversations.create.return_value = created_conversation
-
-    mock_client.conversations = mock_conversations
-    dataset._client = mock_client
-
-    conversation = Conversation(
-        messages=[ChatMessage(role="user", content="Hello!")], tags=["greeting"]
-    )
-
-    result = dataset.create_conversation(conversation)
-
-    mock_conversations.create.assert_called_once()
-    call_kwargs = mock_conversations.create.call_args.kwargs
-    assert "dataset_id" in call_kwargs
-    assert call_kwargs["dataset_id"] == "23868ed8-d12f-40b1-8398-8df4cd066da3"
-    assert result == created_conversation
-
-
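Note: the deleted `test_create_conversation_through_dataset` above pinned one contract that should survive the rename: dataset helpers delegate to the client resource and inject `dataset_id` as a keyword argument. A sketch of the same assertion against the renamed resource (`chat_test_cases` and `create_chat_test_case` are taken from the surviving names in `tests/test_datasets.py`; the delegation details are assumed unchanged):

```python
from unittest.mock import MagicMock

mock_client = MagicMock()
mock_client.chat_test_cases.create.return_value = "created"

dataset_id = "23868ed8-d12f-40b1-8398-8df4cd066da3"

# A Dataset.create_chat_test_case() call is assumed to boil down to this
# delegation, mirroring the deleted conversation-based test above.
mock_client.chat_test_cases.create(
    dataset_id=dataset_id,
    messages=[{"role": "user", "content": "Hello!"}],
)

call_kwargs = mock_client.chat_test_cases.create.call_args.kwargs
assert call_kwargs["dataset_id"] == dataset_id
```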
-def test_conversation_without_client():
-    """Test that trying to access conversation methods without a client raises errors"""
-    dataset = Dataset(name="Test Dataset")  # No client, no ID
-
-    assert dataset.conversations is None
-
-    conversation = Conversation(
-        messages=[ChatMessage(role="user", content="Hello!")],
-    )
-
-    with pytest.raises(ValueError):
-        dataset.create_conversation(conversation)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 3de8ed4a..cce4af0e 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -440,66 +440,6 @@ def test_dataset_chat_test_cases_property_without_id(self):

         assert result is None

-    def test_dataset_conversations_property_deprecation_warning(self):
-        """Test that conversations property shows deprecation warning."""
-        mock_client = MagicMock()
-        mock_client.conversations.list.return_value = ["conversation_1"]
-
-        dataset = Dataset.from_dict(
-            {
-                "id": "dataset-1",
-                "name": "Test Dataset",
-            },
-            _client=mock_client,
-        )
-
-        with pytest.warns(DeprecationWarning, match="Conversation is deprecated"):
-            result = dataset.conversations
-
-        mock_client.conversations.list.assert_called_once_with(dataset_id="dataset-1")
-        assert result == ["conversation_1"]
-
-    def test_dataset_create_conversation_deprecation_warning(self):
-        """Test that create_conversation shows deprecation warning."""
-        from giskard_hub.data.chat import ChatMessage
-        from giskard_hub.data.conversation import Conversation
-
-        mock_client = MagicMock()
-        mock_client.conversations.create.return_value = "created_conversation"
-
-        dataset = Dataset.from_dict(
-            {
-                "id": "dataset-1",
-                "name": "Test Dataset",
-            },
-            _client=mock_client,
-        )
-
-        conversation = Conversation(
-            messages=[ChatMessage(role="user", content="Hello")],
-            demo_output=None,
-        )
-
-        with pytest.warns(DeprecationWarning, match="Conversation is deprecated"):
-            result = dataset.create_conversation(conversation)
-
-        mock_client.conversations.create.assert_called_once()
-        assert result == "created_conversation"
-
-    def test_dataset_create_conversation_without_client(self):
-        """Test create_conversation without client raises error."""
-        from giskard_hub.data.chat import ChatMessage
-        from giskard_hub.data.conversation import Conversation
-
-        dataset = Dataset(name="Test Dataset")
-        conversation = Conversation(
-            messages=[ChatMessage(role="user", content="Hello")],
-            demo_output=None,
-        )
-
-        with pytest.raises(ValueError, match="detached or unsaved"):
-            dataset.create_conversation(conversation)
-
     def test_dataset_create_chat_test_case_with_client(self):
         """Test create_chat_test_case with a client."""
         from giskard_hub.data.chat import ChatMessage
diff --git a/tests/test_evaluations.py b/tests/test_evaluations.py
index 2d15b7a4..207d7255 100644
--- a/tests/test_evaluations.py
+++ b/tests/test_evaluations.py
@@ -3,7 +3,6 @@
 import pytest

 from giskard_hub.data.chat_test_case import ChatTestCase
-from giskard_hub.data.conversation import Conversation
 from giskard_hub.data.evaluation import EvaluationEntry, EvaluationRun, EvaluatorResult
 from giskard_hub.data.model import Model, ModelOutput
 from giskard_hub.data.task import TaskStatus
@@ -58,7 +57,7 @@
 TEST_EVALUATION_ENTRY_DATA = {
     "id": "entry_123",
     "run_id": "run_123",
-    "conversation": TEST_CONVERSATION_DATA,
+    "chat_test_case": TEST_CONVERSATION_DATA,
     "results": [],
     "status": "COMPLETED",
     "model_output": {
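Note: the two fixture hunks here and below rename the evaluation-entry payload key from `conversation` to `chat_test_case`. Based on the updated `test_evaluation_entry_from_chat_test_case` further down, the new deserialization path looks roughly like this sketch (the minimal payload is illustrative; the real fixture carries more fields):

```python
from giskard_hub.data.chat_test_case import ChatTestCase
from giskard_hub.data.evaluation import EvaluationEntry
from giskard_hub.data.task import TaskStatus

# Condensed from the updated test: entries now hydrate the renamed
# "chat_test_case" key into a ChatTestCase instance.
entry = EvaluationEntry.from_dict(
    {
        "chat_test_case": {
            "id": "9c065c7d-421f-4fa1-aad3-902587837849",
            "messages": [{"role": "user", "content": "Hello"}],
        },
        "run_id": "run_123",
        "results": [],
        "status": TaskStatus.RUNNING,
        "model_output": None,
    }
)
assert isinstance(entry.chat_test_case, ChatTestCase)
```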
@@ -102,7 +101,7 @@
 TEST_EVALUATION_ENTRY_WITH_FAILURE_CATEGORY = {
     "id": "entry_with_failure",
     "run_id": "run_123",
-    "conversation": TEST_CONVERSATION_DATA,
+    "chat_test_case": TEST_CONVERSATION_DATA,
     "results": [],
     "status": "FAILED",
     "model_output": {
@@ -141,26 +140,8 @@ def test_evaluation_entry_from_chat_test_case():
         }
     )

-    assert isinstance(evaluation_entry.conversation, ChatTestCase)
-    assert evaluation_entry.conversation.id == chat_test_case.id
-
-
-def test_evaluation_entry_from_conversation():
-    conversation_data = TEST_CONVERSATION_DATA.copy()
-
-    conversation = Conversation.from_dict(conversation_data)
-    evaluation_entry = EvaluationEntry.from_dict(
-        {
-            "conversation": conversation_data,
-            "run_id": "run_123",
-            "results": [],
-            "status": TaskStatus.RUNNING,
-            "model_output": None,
-        }
-    )
-
-    assert isinstance(evaluation_entry.conversation, Conversation)
-    assert evaluation_entry.conversation.id == conversation.id
+    assert isinstance(evaluation_entry.chat_test_case, ChatTestCase)
+    assert evaluation_entry.chat_test_case.id == chat_test_case.id


 # Tests for EvaluationsResource.retrieve()
@@ -332,34 +313,36 @@ def test_create_local_validation_error(evaluations_resource, mock_client):


 # Tests for EvaluationsResource.delete()
-def test_delete_single_execution_id(evaluations_resource, mock_client):
-    """Test deletion with single execution ID."""
+def test_delete_single_evaluation_id(evaluations_resource, mock_client):
+    """Test deletion with single evaluation ID."""
     mock_client.delete.return_value = {"success": True}

     result = evaluations_resource.delete("exec_123")

     mock_client.delete.assert_called_once_with(
-        "/evaluations", params={"execution_ids": "exec_123"}
+        "/evaluations", params={"evaluation_ids": "exec_123"}
     )
     assert result == {"success": True}


-def test_delete_multiple_execution_ids(evaluations_resource, mock_client):
-    """Test deletion with multiple execution IDs."""
+def test_delete_multiple_evaluation_ids(evaluations_resource, mock_client):
+    """Test deletion with multiple evaluation IDs."""
     mock_client.delete.return_value = {"success": True}
-    execution_ids = ["exec_123", "exec_456", "exec_789"]
+    evaluation_ids = ["exec_123", "exec_456", "exec_789"]

-    result = evaluations_resource.delete(execution_ids)
+    result = evaluations_resource.delete(evaluation_ids)

     mock_client.delete.assert_called_once_with(
-        "/evaluations", params={"execution_ids": execution_ids}
+        "/evaluations", params={"evaluation_ids": evaluation_ids}
     )
     assert result == {"success": True}


 def test_delete_not_found(evaluations_resource, mock_client):
-    """Test delete with non-existent execution ID."""
-    mock_client.delete.side_effect = HubAPIError("Execution not found", status_code=404)
+    """Test delete with non-existent evaluation ID."""
+    mock_client.delete.side_effect = HubAPIError(
+        "Evaluation not found", status_code=404
+    )

     with pytest.raises(HubAPIError) as exc_info:
         evaluations_resource.delete("nonexistent_exec")
diff --git a/uv.lock b/uv.lock
index df7e1872..a36cf0a0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1460,7 +1460,7 @@ llm = [

 [[package]]
 name = "giskard-hub"
-version = "1.2.5"
+version = "1.2.7"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
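Note: with the `execution_ids` to `evaluation_ids` rename above, callers should expect the delete call to serialize as follows. A minimal sketch against a mocked transport, mirroring the updated tests:

```python
from unittest.mock import MagicMock

mock_client = MagicMock()
mock_client.delete.return_value = {"success": True}

# A single ID or a list is forwarded verbatim under the renamed
# "evaluation_ids" query parameter.
result = mock_client.delete(
    "/evaluations", params={"evaluation_ids": ["exec_123", "exec_456"]}
)
mock_client.delete.assert_called_once_with(
    "/evaluations", params={"evaluation_ids": ["exec_123", "exec_456"]}
)
assert result == {"success": True}
```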