diff --git a/giskard/rag/metrics/correctness.py b/giskard/rag/metrics/correctness.py
index e2937838cb..ed3d3e333d 100644
--- a/giskard/rag/metrics/correctness.py
+++ b/giskard/rag/metrics/correctness.py
@@ -109,13 +109,21 @@ def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
                 temperature=0,
                 format="json_object",
             )
-            return parse_json_output(
+
+            json_output = parse_json_output(
                 out.content,
                 llm_client=llm_client,
                 keys=["correctness", "correctness_reason"],
                 caller_id=self.__class__.__name__,
             )
 
+            if "correctness" in json_output and not isinstance(json_output["correctness"], bool):
+                raise LLMGenerationError(
+                    f"Error in correctness evaluation: {json_output['correctness']}. Please make sure the agent answer is correctly formatted."
+                )
+
+            return json_output
+
         except Exception as err:
             raise LLMGenerationError("Error while evaluating the agent") from err