Merged

Commits (24)
3db87a0
Enhance dark mode styling + fix Jupiter notebook display
lansolo99 Aug 27, 2025
f3782c5
Merge branch 'main' into fix/gsk-4620-new-merged-giskard-documentatio…
henchaves Aug 28, 2025
e26dbc9
fix: gsk 4620 new merged giskard documentation design update david (#76)
davidberenstein1957 Aug 28, 2025
ad084a4
fix sonar/linting issues
henchaves Aug 28, 2025
43a5c69
Update Makefile to include poetry lock command and update poetry.lock…
davidberenstein1957 Aug 28, 2025
48a50a1
Update Python version in GitHub Actions workflow from 3.13 to 3.12 fo…
davidberenstein1957 Aug 28, 2025
68a340a
Update Python version in GitHub Actions workflow from 3.12 to 3.11 fo…
davidberenstein1957 Aug 28, 2025
8f654cd
Fix table borders color in dark mode
lansolo99 Aug 28, 2025
5588384
Remove poetry lock command from Makefile, update giskard version in p…
davidberenstein1957 Aug 28, 2025
b2b700f
Update dependencies in pyproject.toml and poetry.lock for compatibili…
davidberenstein1957 Aug 28, 2025
d4dac9d
Update Open Graph settings and glossary terminology for clarity
davidberenstein1957 Aug 28, 2025
09cdff7
Update Python version in GitHub Actions workflow to 3.12 and enhance …
davidberenstein1957 Aug 28, 2025
19d828d
Refine Open Graph settings in documentation for improved social media…
davidberenstein1957 Aug 28, 2025
6c2c512
Refine Open Graph configuration in documentation for local developmen…
davidberenstein1957 Aug 28, 2025
72315ef
Update Open Graph configuration in documentation for improved clarity…
davidberenstein1957 Aug 28, 2025
1421036
Add enterprise trial banner functionality and update glossary entries…
davidberenstein1957 Aug 28, 2025
9218132
fix build doc command
henchaves Aug 28, 2025
406618c
Refactor enterprise trial banner logic for improved functionality
davidberenstein1957 Aug 28, 2025
5d02fe3
Update Open Graph settings in documentation for enhanced social sharing
davidberenstein1957 Aug 28, 2025
8e534cc
update trial banner color in dark mode
henchaves Aug 28, 2025
56b94c9
Enhance documentation with new tips and glossary updates
davidberenstein1957 Aug 28, 2025
a6bd99f
Update documentation for clarity and engagement
davidberenstein1957 Aug 28, 2025
cb97b6e
Update feature comparison documentation for clarity
davidberenstein1957 Aug 28, 2025
13a45df
Update enterprise trial banner styles and logic
davidberenstein1957 Aug 28, 2025
2 changes: 1 addition & 1 deletion .github/workflows/update-docs.yml
@@ -33,7 +33,7 @@ jobs:
id: setup-python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
with:
python-version: "3.13"
python-version: "3.12"

# REQUIRED BY NBSphinx
- name: Install Pandoc
1 change: 1 addition & 0 deletions Makefile
@@ -24,6 +24,7 @@ check_format: ## Verify code formatting
.PHONY: check_format

setup: ## Install dependencies
poetry lock
poetry sync
.PHONY: setup

2,047 changes: 1,183 additions & 864 deletions poetry.lock

Large diffs are not rendered by default.

26 changes: 15 additions & 11 deletions pyproject.toml
@@ -17,29 +17,33 @@ python-dateutil = "^2.9.0.post0"

[tool.poetry.group.dev.dependencies]
sphinxawesome-theme = "5.2.0"
myst-parser = "3.0.1"
myst-parser = ">=1.0.0"
notebook = "7.4.4"
nbsphinx = "0.9.7"
nbsphinx = ">=0.9.2"
black = {extras = ["jupyter"], version = "25.1.0"}
isort = "6.0.1"
pylint = "3.3.7"
sphinx-click = "6.0.0"
sphinx-autobuild = "2024.10.3"
sphinx-click = ">=4.4.0"
sphinx-autobuild = ">=2021.3.14"
sphinx-autodoc-typehints = "2.3.0"
sphinx-design = "0.6.1"
sphinx-tabs = "^3.4.7"
sphinx-design = ">=0.4.1"
sphinx-tabs = ">=3.4.1"
pytest = "8.4.1"
# unified documentation for giskard
giskard = {extras = ["llm"], version = "^2.0.0", python = ">=3.9,<3.13"}
giskard = {extras = ["llm", "dev"], version = "^2.0.0", python = ">=3.9,<3.13"}
pyarrow = "<20.0.0"
ragas = "0.1.5, <=0.2.7"
ipywidgets = "^8.1.7"
sphinxext-opengraph = {version = "^0.12.0", extras = ["social_cards"]}
pandoc = "^2.4"
torch = [
{markers = "platform_system == \"Darwin\"", version = "^2.8.0"},
{markers = "platform_system != \"Darwin\"", version = "^2.8.0", source = "pytorch_cpu"}
]
sphinx = ">=6.1.3"
sphinxcontrib-napoleon = ">=0.7"
sphinx-autoapi = ">=2.1.0"
sphinx-rtd-theme = ">=1.2.0"
sphinx-copybutton = ">=0.5.2"
ipython = "==8.12.0"
scrapy = "*"
requests = "*"

[build-system]
requires = ["poetry-core"]
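The ``torch`` entry in the pyproject.toml diff above relies on PEP 508 environment markers so that macOS resolves the default wheel while every other platform pulls from the ``pytorch_cpu`` source. How such a marker resolves can be sketched with the ``packaging`` library (the same machinery pip and Poetry build on); the environment dicts below are illustrative overrides, not real installs.

```python
from packaging.markers import Marker

# The marker used in the torch entry of pyproject.toml.
darwin_only = Marker('platform_system == "Darwin"')

# Marker.evaluate() accepts an environment override, so we can check how
# the marker resolves on different platforms without being on them.
print(darwin_only.evaluate({"platform_system": "Darwin"}))  # macOS -> True
print(darwin_only.evaluate({"platform_system": "Linux"}))   # Linux -> False
```

When the marker is false, Poetry falls through to the other list entry, which is how the CPU-only wheel source gets selected off macOS.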
61 changes: 59 additions & 2 deletions script-docs/_static/custom.css
@@ -133,7 +133,8 @@ html.dark body,
.dark .sd-card,
.dark aside#left-sidebar,
.dark header,
.dark footer {
.dark footer,
.dark pre {
border-color: #40DEDF21 !important;
}

@@ -406,7 +407,6 @@ body[data-content_root="./"] header nav a[class*="text-foreground"] {
color: #C6FFFF !important;
}

=======
/* Sphinx Design Tabs Styling */
.sd-tab-set {
margin: 1rem 0;
@@ -457,3 +457,60 @@ body[data-content_root="./"] header nav a[class*="text-foreground"] {
display: block;
}

/* Dark mode fixes for notebook content visibility */
.dark div.nboutput.container div.output_area,
.dark div.nbinput.container div.input_area,
.dark div.rendered_html,
.dark .jp-RenderedHTMLCommon {
color: #C6FFFF !important;
background: transparent !important;
visibility: visible !important;
opacity: 1 !important;
}

.dark div.nboutput.container div.output_area > *,
.dark div.nbinput.container div.input_area > *,
.dark div.rendered_html *,
.dark .jp-RenderedHTMLCommon * {
color: #C6FFFF !important;
visibility: visible !important;
}

/* Ensure notebook text content is visible */
.dark div.nboutput.container div.output_area pre,
.dark div.nbinput.container div.input_area pre,
.dark div.nboutput.container .highlight,
.dark div.nbinput.container .highlight,
.dark div.output_text,
.dark div.output_html {
color: #C6FFFF !important;
background: transparent !important;
}

/* Dark mode colors */
.dark div.nboutput.container,
.dark div.nbinput.container,
.dark div.nboutput.container div.output_area,
.dark div.nbinput.container div.input_area,
.dark div.rendered_html,
.dark .jp-RenderedHTMLCommon {
color: #C6FFFF !important;
}


/* Fix table colors in dark mode */
.dark .jp-RenderedHTMLCommon table,
.dark div.rendered_html table {
color: #C6FFFF !important;
}

.dark .jp-RenderedHTMLCommon td,
.dark .jp-RenderedHTMLCommon th,
.dark div.rendered_html td,
.dark div.rendered_html th {
color: #C6FFFF !important;
}

.nboutput.nblast.docutils.container{
display: block !important;
}
21 changes: 20 additions & 1 deletion script-docs/conf.py
@@ -10,14 +10,15 @@
import os
import sys
from dataclasses import asdict
from datetime import datetime

from sphinxawesome_theme import ThemeOptions
from sphinxawesome_theme.postprocess import Icons

html_permalinks_icon = Icons.permalinks_icon

project = "Giskard"
copyright = "2025, Giskard"
copyright = f"{datetime.now().year}, Giskard"
author = "Giskard"

# -- General configuration ---------------------------------------------------
@@ -41,6 +42,22 @@
# "sphinx_autodoc_typehints",
]

myst_enable_extensions = [
"amsmath",
"attrs_inline",
"colon_fence",
"deflist",
"dollarmath",
"fieldlist",
"html_admonition",
"html_image",
"replacements",
"smartquotes",
"strikethrough",
"substitution",
"tasklist",
]

# Resolve Dataset cross-reference ambiguity
autodoc_type_aliases = {
"Dataset": "giskard.Dataset",
@@ -71,6 +88,8 @@
else:
branch = docs_version.replace("-", "/")
branch = "main"

# -- Options for nbsphinx ----------------------------------------------------
nbsphinx_execute = "never"
# fmt: off
nbsphinx_prolog = """
106 changes: 85 additions & 21 deletions script-docs/hub/sdk/evaluations.rst
@@ -65,7 +65,6 @@ We can configure the agent endpoint in the Hub:
headers={"X-API-Key": "SECRET_TOKEN"},
)


You can test that everything is working by sending a test request to the agent

.. code-block:: python
@@ -78,9 +77,9 @@
print(response)
# ModelOutput(message=ChatMessage(role='assistant', content='It is sunny!'))

Create a evaluation
___________________

Run an evaluation
_________________

Now that the agent is configured, we can launch an evaluation run. We first need
to know which dataset we will run the evaluation on. If you are running this in
@@ -94,14 +93,15 @@

.. code-block:: python

eval_run = hub.evaluate(
model=model.id,
dataset=dataset_id
# optionally, specify a name
eval_run = hub.evaluations.create(
model_id=model.id,
dataset_id=dataset_id,
# optionally,
tags=["staging", "build"],
run_count=1, # number of runs per case
name="staging-build-a4f321",
)


The evaluation run will be queued and processed by the Hub. The ``evaluate``
method will immediately return an :class:`~giskard_hub.data.EvaluationRun` object
while the evaluation is running. Note however that this object will not contain
@@ -117,7 +117,6 @@ You can wait until the evaluation run has finished running with the
timeout=600
)


This will block until the evaluation is completed and update the ``eval_run``
object in-place. The method will wait for up to 10 minutes for the
evaluation to complete. If the evaluation takes longer, the method will raise a
@@ -159,19 +158,18 @@ For example:
print(f"FAILED: {metric.name} is below 90%.")
sys.exit(1)



That covers the basics of running evaluations in the Hub. You can now integrate
this code in your CI/CD pipeline to automatically evaluate your agents every
time you deploy a new version.
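The CI/CD gate hinted at in the diff above (``sys.exit(1)`` when a metric falls below 90%) can be made concrete. The following is a minimal, self-contained sketch of that threshold check; the plain-dict metric shape and the ``percentage`` field stand in for the SDK's actual metric objects, whose attributes may differ.

```python
# Hypothetical CI gate: fail the pipeline when any evaluation metric
# drops below a pass-rate threshold. The dict shape is an assumption,
# not the real giskard_hub API.
PASS_THRESHOLD = 90.0

def failing_metrics(metrics, threshold=PASS_THRESHOLD):
    """Return the names of metrics whose pass rate is below the threshold."""
    return [m["name"] for m in metrics if m["percentage"] < threshold]

metrics = [
    {"name": "correctness", "percentage": 95.0},
    {"name": "groundedness", "percentage": 82.5},
]

failed = failing_metrics(metrics)
for name in failed:
    print(f"FAILED: {name} is below {PASS_THRESHOLD:g}%.")
# In a CI job you would then abort the build, e.g.:
#     sys.exit(1 if failed else 0)
```

Wiring the real ``eval_run`` metrics into ``failing_metrics`` is then a matter of adapting the field access to whatever the SDK exposes.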

.. note:: If you want to run evaluations on a local model that is not yet
exposed with an API, check :ref:`local-evaluation`.
.. note::

If you want to run evaluations on a local model that is not yet exposed with an API, check :ref:`local-evaluation`.

Compare evaluations
___________________

After running evaluations, you can compare them to see if there are any regressions. We do not offer a built-in comparison tool in the SDK, but you can :ref:`use the Hub UI to compare evaluations <compare-evaluations>`.
After running evaluations, you can compare them to see if there are any regressions. We do not offer a built-in comparison tool in the SDK, but you can :ref:`use the Hub UI to compare evaluations <hub/ui/evaluations-compare>`.

.. _local-evaluation:

@@ -240,8 +238,8 @@ You can check that everything works simply by running the function:
my_local_agent([ChatMessage(role="user", content="Hello")])
# Output: "You said: 'Hello'"

Run an evaluation
_________________
Create a local evaluation
_________________________

Running the evaluation is similar to what we have seen for remote evaluations. Instead of passing a remote model ID to the
``evaluate`` method of the Hub client, we will pass the function we defined
@@ -258,9 +256,9 @@

.. code-block:: python

eval_run = hub.evaluate(
eval_run = hub.evaluations.create_local(
model=my_local_agent,
dataset=dataset_id,
dataset_id=dataset_id,
# optionally, specify a name
name="test-run",
)
@@ -276,7 +274,6 @@ the evaluation run to complete and then print the results:
# Print the metrics
eval_run.print_metrics()


.. figure:: /_static/images/cli/metrics_output.png
:alt: Evaluation metrics output

@@ -285,5 +282,72 @@
You can also check the results in the Hub interface and compare it with other
evaluation runs.

.. hint:: You may also want to use this method in your CI/CD pipeline, to
perform checks when the code or the prompts of your agent get updated.
.. hint::

You may also want to use this method in your CI/CD pipeline, to perform checks when the code or the prompts of your agent get updated.

Evaluations
~~~~~~~~~~~

Create an evaluation
--------------------

You can create a new evaluation using the ``hub.evaluations.create()`` method.

.. code-block:: python

eval_run = hub.evaluations.create(
model_id=model.id,
dataset_id=dataset.id,
tags=["nightly", "regression"],
run_count=1,
name="nightly-regression-1"
)

Retrieve an evaluation
----------------------

You can retrieve an evaluation using the ``hub.evaluations.retrieve()`` method.

.. code-block:: python

eval_run = hub.evaluations.retrieve(eval_run.id)

Update an evaluation
--------------------

You can update an evaluation using the ``hub.evaluations.update()`` method.

.. code-block:: python

eval_run = hub.evaluations.update(eval_run.id, tags=["staging", "build"])

Delete an evaluation
--------------------

You can delete an evaluation using the ``hub.evaluations.delete()`` method.

.. code-block:: python

hub.evaluations.delete(eval_run.id)

List evaluations
----------------

You can list evaluations using the ``hub.evaluations.list()`` method.

.. code-block:: python

eval_runs = hub.evaluations.list(project_id=project_id)

List evaluation results
-----------------------

You can list evaluation results using the ``hub.evaluations.list_entries()`` method.

.. code-block:: python

eval_results = hub.evaluations.list_entries(eval_run.id)

.. note::
As of now, the Giskard Hub SDK does not support scheduled evaluations but you can use the `Giskard Hub UI </hub/ui/evaluations>`_ to schedule evaluations.
8 changes: 4 additions & 4 deletions script-docs/hub/sdk/index.rst
@@ -8,19 +8,19 @@ The Giskard Hub SDK provides a Python interface to interact with the Giskard Hub

.. grid:: 1 1 2 2

.. grid-item-card:: Manage Projects
.. grid-item-card:: Manage projects and agents
:link: projects
:link-type: doc

Create, update, and organize projects
Create, update, and organize projects and agents

.. grid-item-card:: Manage Datasets and Conversations
.. grid-item-card:: Manage datasets and conversations
:link: datasets/index
:link-type: doc

Create, update, and organize test datasets and conversations manually or using synthetic data generation

.. grid-item-card:: Manage Checks
.. grid-item-card:: Manage checks
:link: checks
:link-type: doc
