Merged

Commits
37 commits
7316d1d
Added tool calls support for LLM clients
kevinmessiaen Dec 21, 2023
c7eeee2
Migrated LLMBasedEvaluator to use tools instead of function
kevinmessiaen Dec 21, 2023
aaec45b
Migrated BaseDataGenerator to use tools instead of function
kevinmessiaen Dec 21, 2023
81d0e40
Migrated TestcaseRequirementsGenerator to use tools instead of function
kevinmessiaen Dec 21, 2023
cfe3401
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Dec 21, 2023
a59c8ef
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Dec 22, 2023
7776246
Fixed tests
kevinmessiaen Dec 22, 2023
a8874f8
Merge remote-tracking branch 'origin/feature/gsk-2367-migrate-openai-…
kevinmessiaen Dec 22, 2023
70eac12
Fixed CoherencyEvaluator
kevinmessiaen Dec 22, 2023
daa2059
Fixed tests
kevinmessiaen Dec 22, 2023
4f3bfa0
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Dec 22, 2023
2447cf8
Added ID to LLM tool calls
kevinmessiaen Dec 22, 2023
9841ccf
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
mattbit Dec 22, 2023
6000e4c
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 1, 2024
ef2a7a2
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
mattbit Jan 2, 2024
2afcf94
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
andreybavt Jan 2, 2024
67e9fd2
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 5, 2024
0066a14
Import reorganization
kevinmessiaen Jan 5, 2024
ca6c1a3
Fixed test
kevinmessiaen Jan 5, 2024
3b60c31
Added raw_output to LLMOutput
kevinmessiaen Jan 8, 2024
3908086
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 8, 2024
abc37aa
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 24, 2024
3e65e57
Revert "Added raw_output to LLMOutput"
kevinmessiaen Jan 24, 2024
dd48192
Improved code structure
kevinmessiaen Jan 24, 2024
5227b76
Fixed tool calls
kevinmessiaen Jan 24, 2024
3fcc077
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 26, 2024
261b8e7
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 26, 2024
98ae8c7
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 26, 2024
1f66c50
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Jan 31, 2024
1ac608d
Added a toJSON method
kevinmessiaen Jan 31, 2024
2639c81
Use model dump instead of toJSON
kevinmessiaen Jan 31, 2024
bdecc8b
Use model dump instead of toJSON
kevinmessiaen Jan 31, 2024
1687ecc
Fixed model serialization
kevinmessiaen Jan 31, 2024
be657c3
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Feb 1, 2024
947ed6a
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Feb 2, 2024
7d6b53d
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
mattbit Feb 2, 2024
b91e41e
Merge branch 'main' into feature/gsk-2367-migrate-openai-api-call-fro…
kevinmessiaen Feb 2, 2024
Files changed
5 changes: 3 additions & 2 deletions giskard/llm/client/__init__.py
@@ -3,7 +3,7 @@
import logging
import os

from .base import LLMClient, LLMFunctionCall, LLMOutput
from .base import LLMClient, LLMFunctionCall, LLMMessage, LLMToolCall
from .logger import LLMLogger

_default_client = None
@@ -75,7 +75,8 @@ def get_default_client() -> LLMClient:
__all__ = [
"LLMClient",
"LLMFunctionCall",
"LLMOutput",
"LLMToolCall",
"LLMMessage",
"LLMLogger",
"get_default_client",
"set_llm_model",
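Note for downstream users: LLMOutput disappears from the public exports, so imports must move to the new names. A minimal before/after sketch:

# Before this PR:
#   from giskard.llm.client import LLMOutput
# After it:
from giskard.llm.client import LLMFunctionCall, LLMMessage, LLMToolCall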
31 changes: 23 additions & 8 deletions giskard/llm/client/base.py
@@ -1,4 +1,4 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional, Sequence

from abc import ABC, abstractmethod
from dataclasses import dataclass
@@ -8,14 +8,27 @@

@dataclass
class LLMFunctionCall:
function: str
args: Any
name: str
arguments: Any


@dataclass
class LLMOutput:
message: Optional[str] = None
function_call: Optional[LLMFunctionCall] = None
class LLMToolCall:
id: str
type: str
function: LLMFunctionCall


@dataclass
class LLMMessage:
role: str
content: Optional[str]
function_call: Optional[LLMFunctionCall]
tool_calls: Optional[List[LLMToolCall]]

@staticmethod
def create_message(role: str, content: str):
return LLMMessage(role=role, content=content, function_call=None, tool_calls=None)


class LLMClient(ABC):
@@ -27,11 +40,13 @@ def logger(self) -> LLMLogger:
@abstractmethod
def complete(
self,
messages,
messages: Sequence[LLMMessage],
functions=None,
temperature=0.5,
max_tokens=None,
function_call: Optional[Dict] = None,
caller_id: Optional[str] = None,
) -> LLMOutput:
tools=None,
tool_choice=None,
) -> LLMMessage:
...
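The new dataclasses mirror the shape of OpenAI's tool-calling messages: a tool call wraps a function call, and a message can carry several tool calls. A hand-built sketch, with the ID and argument values invented for illustration:

from giskard.llm.client.base import LLMFunctionCall, LLMMessage, LLMToolCall

# An assistant reply carrying a single tool call, shaped like what
# LLMClient.complete() now returns; all field values are illustrative.
call = LLMToolCall(
    id="call_0",  # hypothetical ID; real ones are assigned by the API
    type="function",
    function=LLMFunctionCall(name="evaluate_model", arguments={"passed_test": True}),
)
reply = LLMMessage(role="assistant", content=None, function_call=None, tool_calls=[call])

# Plain text messages are simpler to build via the new helper:
system = LLMMessage.create_message(role="system", content="You are an evaluator.")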
114 changes: 96 additions & 18 deletions giskard/llm/client/openai.py
@@ -1,4 +1,4 @@
from typing import Dict, Optional, Sequence
from typing import Dict, List, Optional, Sequence

import json
from abc import ABC, abstractmethod
@@ -7,7 +7,8 @@

from ..config import LLMConfigurationError
from ..errors import LLMGenerationError, LLMImportError
from . import LLMClient, LLMFunctionCall, LLMLogger, LLMOutput
from . import LLMClient, LLMFunctionCall, LLMLogger, LLMMessage
from .base import LLMToolCall

try:
import openai
@@ -38,39 +39,101 @@ def _completion(
function_call: Optional[Dict] = None,
max_tokens=None,
caller_id: Optional[str] = None,
tools=None,
tool_choice=None,
) -> dict:
...

@staticmethod
def _serialize_function_call(function_call: LLMFunctionCall) -> Dict:
return {"name": function_call.name, "arguments": json.dumps(function_call.arguments)}

@staticmethod
def _serialize_tool_call(tool_call: LLMToolCall) -> Dict:
return {
"id": tool_call.id,
"type": tool_call.type,
"function": BaseOpenAIClient._serialize_function_call(tool_call.function),
}

@staticmethod
def _serialize_tool_calls(tool_calls: List[LLMToolCall]) -> List[Dict]:
return [BaseOpenAIClient._serialize_tool_call(tool_call) for tool_call in tool_calls]

@staticmethod
def _serialize_message(response: LLMMessage) -> Dict:
result = {
"role": response.role,
"content": response.content,
"function_call": BaseOpenAIClient._serialize_function_call(response.function_call)
if response.function_call
else None,
"tool_calls": BaseOpenAIClient._serialize_tool_calls(response.tool_calls) if response.tool_calls else None,
}

return {key: value for key, value in result.items() if value is not None}

@staticmethod
def _parse_function_call(function_call) -> LLMFunctionCall:
try:
return LLMFunctionCall(
name=function_call["name"],
arguments=json.loads(function_call["arguments"]),
)
except (json.JSONDecodeError, KeyError) as err:
raise LLMGenerationError("Could not parse function call") from err

@staticmethod
def _parse_tool_call(tool_call) -> LLMToolCall:
return LLMToolCall(
id=tool_call["id"],
type=tool_call["type"],
function=BaseOpenAIClient._parse_function_call(tool_call["function"]),
)

@staticmethod
def _parse_tool_calls(tool_calls) -> List[LLMToolCall]:
return [BaseOpenAIClient._parse_tool_call(tool_call) for tool_call in tool_calls]

@staticmethod
def _parse_message(response) -> LLMMessage:
return LLMMessage(
role=response["role"],
content=response["content"],
function_call=BaseOpenAIClient._parse_function_call(response["function_call"])
if "function_call" in response and response["function_call"] is not None
else None,
tool_calls=BaseOpenAIClient._parse_tool_calls(response["tool_calls"])
if "tool_calls" in response and response["tool_calls"] is not None
else None,
)

def complete(
self,
messages,
messages: Sequence[LLMMessage],
functions=None,
temperature=0.5,
max_tokens=None,
function_call: Optional[Dict] = None,
caller_id: Optional[str] = None,
tools=None,
tool_choice=None,
):
cc = self._completion(
messages=messages,
llm_message = self._completion(
messages=[
BaseOpenAIClient._serialize_message(message) if isinstance(message, LLMMessage) else message
for message in messages
],
temperature=temperature,
functions=functions,
function_call=function_call,
max_tokens=max_tokens,
caller_id=caller_id,
tools=tools,
tool_choice=tool_choice,
)

function_call = None

if fc := cc.get("function_call"):
try:
function_call = LLMFunctionCall(
function=fc["name"],
args=json.loads(fc["arguments"], strict=False),
)
except (json.JSONDecodeError, KeyError) as err:
raise LLMGenerationError("Could not parse function call") from err

return LLMOutput(message=cc["content"], function_call=function_call)
return BaseOpenAIClient._parse_message(llm_message)


class LegacyOpenAIClient(BaseOpenAIClient):
@@ -90,17 +153,26 @@ def _completion(
function_call: Optional[Dict] = None,
max_tokens=None,
caller_id: Optional[str] = None,
tools=None,
tool_choice=None,
):
extra_params = dict()
if function_call is not None:
extra_params["function_call"] = function_call
if functions is not None:
extra_params["functions"] = functions
if tools is not None:
extra_params["tools"] = tools
if tool_choice is not None:
extra_params["tool_choice"] = tool_choice

try:
completion = openai.ChatCompletion.create(
model=self.model,
messages=messages,
messages=[
BaseOpenAIClient._serialize_message(message) if isinstance(message, LLMMessage) else message
for message in messages
],
temperature=temperature,
max_tokens=max_tokens,
**extra_params,
@@ -135,12 +207,18 @@ def _completion(
function_call: Optional[Dict] = None,
max_tokens=None,
caller_id: Optional[str] = None,
tools=None,
tool_choice=None,
):
extra_params = dict()
if function_call is not None:
extra_params["function_call"] = function_call
if functions is not None:
extra_params["functions"] = functions
if tools is not None:
extra_params["tools"] = tools
if tool_choice is not None:
extra_params["tool_choice"] = tool_choice

try:
completion = self._client.chat.completions.create(
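The helpers above are symmetric: _serialize_message turns an LLMMessage into the dict the OpenAI SDK expects (dropping None-valued keys), while _parse_message rebuilds an LLMMessage from a response dict, JSON-decoding the tool-call arguments on the way in. A round-trip sketch, using these private helpers purely for illustration:

from giskard.llm.client.base import LLMMessage
from giskard.llm.client.openai import BaseOpenAIClient

msg = LLMMessage.create_message(role="user", content="Hello")
BaseOpenAIClient._serialize_message(msg)
# -> {"role": "user", "content": "Hello"}   (None-valued fields are stripped)

# A dict mimicking an OpenAI response message; the ID is hypothetical.
parsed = BaseOpenAIClient._parse_message(
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {
                "id": "call_0",
                "type": "function",
                "function": {"name": "evaluate_model", "arguments": '{"passed_test": true}'},
            }
        ],
    }
)
assert parsed.tool_calls[0].function.arguments == {"passed_test": True}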
37 changes: 20 additions & 17 deletions giskard/llm/evaluators/base.py
@@ -11,22 +11,25 @@

EVALUATE_MODEL_FUNCTIONS = [
{
"name": "evaluate_model",
"description": "Evaluates if the model passes the test",
"parameters": {
"type": "object",
"properties": {
"passed_test": {
"type": "boolean",
"description": "true if the model successfully passes the test",
},
"reason": {
"type": "string",
"description": "optional short description of why the model does not pass the test, in 1 or 2 short sentences",
"type": "function",
"function": {
"name": "evaluate_model",
"description": "Evaluates if the model passes the test",
"parameters": {
"type": "object",
"properties": {
"passed_test": {
"type": "boolean",
"description": "true if the model successfully passes the test",
},
"reason": {
"type": "string",
"description": "optional short description of why the model does not pass the test, in 1 or 2 short sentences",
},
},
"required": ["passed_test"],
},
},
"required": ["passed_test"],
},
]

@@ -101,20 +104,20 @@ def evaluate(self, model: BaseModel, dataset: Dataset):
try:
out = self.llm_client.complete(
[{"role": "system", "content": prompt}],
functions=funcs,
function_call={"name": "evaluate_model"},
tools=funcs,
tool_choice={"type": "function", "function": {"name": "evaluate_model"}},
temperature=self.llm_temperature,
caller_id=self.__class__.__name__,
)
if out.function_call is None or "passed_test" not in out.function_call.args:
if len(out.tool_calls) != 1 or "passed_test" not in out.tool_calls[0].function.arguments:
raise LLMGenerationError("Invalid function call arguments received")
except LLMGenerationError as err:
status.append(TestResultStatus.ERROR)
reasons.append(str(err))
errored.append({"message": str(err), "sample": sample})
continue

args = out.function_call.args
args = out.tool_calls[0].function.arguments
reasons.append(args.get("reason"))
if args["passed_test"]:
status.append(TestResultStatus.PASSED)
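The pattern above replaces the legacy function_call={"name": ...} argument with the tools API: the schema is passed as tools, the call is forced with tool_choice, and the decoded arguments are read from out.tool_calls[0].function.arguments. A sketch of the calling convention, assuming a configured default client (for example, OpenAI credentials in the environment):

from giskard.llm.client import get_default_client
from giskard.llm.evaluators.base import EVALUATE_MODEL_FUNCTIONS

client = get_default_client()  # assumes an OpenAI key is configured
out = client.complete(
    [{"role": "system", "content": "..."}],  # evaluation prompt elided
    tools=EVALUATE_MODEL_FUNCTIONS,
    tool_choice={"type": "function", "function": {"name": "evaluate_model"}},
)
args = out.tool_calls[0].function.arguments  # a decoded dict, not a JSON string
passed_test, reason = args["passed_test"], args.get("reason")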
8 changes: 4 additions & 4 deletions giskard/llm/evaluators/coherency.py
@@ -98,13 +98,13 @@ def _eval_pair(self, model: BaseModel, input_1, input_2, output_1, output_2):

out = self.llm_client.complete(
[{"role": "system", "content": prompt}],
functions=EVALUATE_MODEL_FUNCTIONS,
function_call={"name": "evaluate_model"}, # force function call
tools=EVALUATE_MODEL_FUNCTIONS,
tool_choice={"type": "function", "function": {"name": "evaluate_model"}}, # force tool call
temperature=self.llm_temperature,
caller_id=self.__class__.__name__,
)

if out.function_call is None or "passed_test" not in out.function_call.args:
if len(out.tool_calls) != 1 or "passed_test" not in out.tool_calls[0].function.arguments:
raise LLMGenerationError("Invalid function call arguments received")

return out.function_call.args["passed_test"], out.function_call.args.get("reason")
return out.tool_calls[0].function.arguments["passed_test"], out.tool_calls[0].function.arguments.get("reason")
39 changes: 21 additions & 18 deletions giskard/llm/generators/base.py
@@ -63,20 +63,23 @@ def _make_generate_input_prompt(self, model: BaseModel, num_samples: int):
def _make_generate_input_functions(self, model: BaseModel, num_samples: int):
return [
{
"name": "generate_inputs",
"description": "generates inputs for model audit",
"parameters": {
"type": "object",
"properties": {
"inputs": {
"type": "array",
"items": {
"type": "object",
"properties": {name: {"type": "string"} for name in model.feature_names},
},
}
"type": "function",
"function": {
"name": "generate_inputs",
"description": "generates inputs for model audit",
"parameters": {
"type": "object",
"properties": {
"inputs": {
"type": "array",
"items": {
"type": "object",
"properties": {name: {"type": "string"} for name in model.feature_names},
},
}
},
"required": ["inputs"],
},
"required": ["inputs"],
},
}
]
@@ -109,19 +112,19 @@ def generate_dataset(self, model: BaseModel, num_samples: int = 10, column_types

"""
prompt = self._make_generate_input_prompt(model, num_samples)
functions = self._make_generate_input_functions(model, num_samples)
tools = self._make_generate_input_functions(model, num_samples)

out = self.llm_client.complete(
messages=[{"role": "system", "content": prompt}],
functions=functions,
function_call={"name": "generate_inputs"},
tools=tools,
tool_choice={"type": "function", "function": {"name": "generate_inputs"}},
temperature=self.llm_temperature,
caller_id=self.__class__.__name__,
)

try:
generated = out.function_call.args["inputs"]
except (AttributeError, KeyError) as err:
generated = out.tool_calls[0].function.arguments["inputs"]
except (AttributeError, KeyError, IndexError, TypeError) as err:
raise LLMGenerationError("Could not parse generated inputs") from err

dataset = Dataset(
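As in the evaluators, the generator now reads its structured output from the first tool call; the broadened except clause also covers responses with no tool calls at all (IndexError, TypeError). A sketch of the decoded shape, with the feature name and values invented for illustration:

from giskard.llm.client import LLMFunctionCall, LLMToolCall

# Hand-built tool call mimicking an LLM response to generate_inputs for a
# model with feature_names == ["question"]; values are purely illustrative.
call = LLMToolCall(
    id="call_0",
    type="function",
    function=LLMFunctionCall(
        name="generate_inputs",
        arguments={
            "inputs": [
                {"question": "What is the refund policy?"},
                {"question": "Can I change my delivery address?"},
            ]
        },
    ),
)
rows = call.function.arguments["inputs"]  # one dict per generated sample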