@@ -3,7 +3,7 @@
"isRoot": true,
"tools": {
"microsoft.extensions.ai.evaluation.console": {
"version": "9.3.0-preview.1.25126.9",
"version": "9.3.0-preview.1.25164.6",
"commands": [
"aieval"
],
7 changes: 6 additions & 1 deletion src/microsoft-extensions-ai-evaluation/api/Examples.sln
@@ -9,11 +9,16 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Reporting", "reporting\Repo
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Markdown Documents", "Markdown Documents", "{02EA681E-C7D8-13C7-8484-4AC65E1B71E8}"
ProjectSection(SolutionItems) = preProject
..\README.md = ..\README.md
INSTRUCTIONS.md = INSTRUCTIONS.md
..\README.md = ..\README.md
README.md = README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tool Configuration", "Tool Configuration", "{9BBB54DB-BF9B-43D0-9502-3347C9DAF717}"
ProjectSection(SolutionItems) = preProject
.config\dotnet-tools.json = .config\dotnet-tools.json
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -12,13 +12,13 @@
<ItemGroup>
<PackageReference Include="Azure.AI.OpenAI" Version="2.1.0" />
<PackageReference Include="Azure.Identity" Version="1.13.2" />
<PackageReference Include="FluentAssertions" Version="8.0.1" />
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.3.0-preview.1.25126.9" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.3.0-preview.1.25126.9" />
<PackageReference Include="Microsoft.Extensions.AI.Ollama" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="FluentAssertions" Version="7.2.0" />
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.3.0-preview.1.25164.6" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.3.0-preview.1.25164.6" />
<PackageReference Include="Microsoft.Extensions.AI.Ollama" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
<PackageReference Include="MSTest.TestAdapter" Version="3.7.3" />
@@ -42,7 +42,7 @@ public async Task Example01_InvokingOneEvaluator()
/// interpretation can also be changed after the fact to suit your specific requirements if needed.
///
/// Validate the default interpretation for the returned coherence metric.
coherence.Interpretation!.Failed.Should().NotBe(true);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);

/// Evaluators such as <see cref="CoherenceEvaluator"/> above can include diagnostics on the metrics they
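For orientation, here is a condensed sketch of the single-evaluator flow this hunk asserts over. It reuses the s_messages, s_response and s_chatConfiguration members defined elsewhere in this change set and assumes the updated ChatResponse-based EvaluateAsync signature; BeFalse() simply states the expected outcome more directly than NotBe(true).

IEvaluator coherenceEvaluator = new CoherenceEvaluator();

// The LLM behind s_chatConfiguration scores the response for coherence; the evaluator also
// attaches a default interpretation (rating plus pass/fail) that the assertions inspect.
EvaluationResult result =
    await coherenceEvaluator.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

NumericMetric coherence = result.Get<NumericMetric>(CoherenceEvaluator.CoherenceMetricName);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
coherence.ContainsDiagnostics().Should().BeFalse();
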
@@ -14,12 +14,11 @@ public partial class EvaluationExamples
[TestMethod]
public async Task Example02_InvokingMultipleEvaluators()
{
/// Create a <see cref="CompositeEvaluator"/> that composes a <see cref="CoherenceEvaluator"/>, a
/// <see cref="FluencyEvaluator"/> and a <see cref="RelevanceTruthAndCompletenessEvaluator"/>.
/// Create a <see cref="CompositeEvaluator"/> that composes a <see cref="CoherenceEvaluator"/> and a
/// <see cref="FluencyEvaluator"/>.
IEvaluator coherenceEvaluator = new CoherenceEvaluator();
IEvaluator fluencyEvaluator = new FluencyEvaluator();
IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator();
IEvaluator compositeEvaluator = new CompositeEvaluator(coherenceEvaluator, fluencyEvaluator, rtcEvaluator);
IEvaluator compositeEvaluator = new CompositeEvaluator(coherenceEvaluator, fluencyEvaluator);

/// Invoke the <see cref="CompositeEvaluator"/> to evaluate the 'coherence', 'fluency', 'relevance', 'truth'
/// and 'completeness' of the response in <see cref="s_response"/>. The evaluation is performed using the LLM
@@ -30,39 +29,16 @@ public async Task Example02_InvokingMultipleEvaluators()

/// Retrieve the score for coherence from the <see cref="EvaluationResult"/>.
NumericMetric coherence = result.Get<NumericMetric>(CoherenceEvaluator.CoherenceMetricName);
coherence.Interpretation!.Failed.Should().NotBe(true);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
coherence.ContainsDiagnostics().Should().BeFalse();
coherence.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for fluency from the <see cref="EvaluationResult"/>.
NumericMetric fluency = result.Get<NumericMetric>(FluencyEvaluator.FluencyMetricName);
fluency.Interpretation!.Failed.Should().NotBe(true);
fluency.Interpretation!.Failed.Should().BeFalse();
fluency.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
fluency.ContainsDiagnostics().Should().BeFalse();
fluency.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for relevance from the <see cref="EvaluationResult"/>.
NumericMetric relevance =
result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName);
relevance.Interpretation!.Failed.Should().NotBe(true);
relevance.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
relevance.ContainsDiagnostics().Should().BeFalse();
relevance.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for truth from the <see cref="EvaluationResult"/>.
NumericMetric truth = result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.TruthMetricName);
truth.Interpretation!.Failed.Should().NotBe(true);
truth.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
truth.ContainsDiagnostics().Should().BeFalse();
truth.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for completeness from the <see cref="EvaluationResult"/>.
NumericMetric completeness =
result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName);
completeness.Interpretation!.Failed.Should().NotBe(true);
completeness.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
completeness.ContainsDiagnostics().Should().BeFalse();
completeness.Value.Should().BeGreaterThanOrEqualTo(3);
}
}
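
After this change the composition covers only coherence and fluency. A minimal sketch of the resulting pattern, again reusing the test class members from this change set: a single EvaluateAsync call on the CompositeEvaluator yields one EvaluationResult that carries a metric per composed evaluator, each retrieved by the well-known name its evaluator publishes.

IEvaluator compositeEvaluator = new CompositeEvaluator(new CoherenceEvaluator(), new FluencyEvaluator());
EvaluationResult result =
    await compositeEvaluator.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

// Both metrics come back in the same result and are looked up by name.
foreach (string metricName in
    new[] { CoherenceEvaluator.CoherenceMetricName, FluencyEvaluator.FluencyMetricName })
{
    NumericMetric metric = result.Get<NumericMetric>(metricName);
    metric.Interpretation!.Failed.Should().BeFalse();
    metric.ContainsDiagnostics().Should().BeFalse();
    metric.Value.Should().BeGreaterThanOrEqualTo(3);
}
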
@@ -58,14 +58,14 @@ await compositeEvaluator.EvaluateAsync(

/// Retrieve the score for equivalence from the <see cref="EvaluationResult"/>.
NumericMetric equivalence = result.Get<NumericMetric>(EquivalenceEvaluator.EquivalenceMetricName);
equivalence.Interpretation!.Failed.Should().NotBe(true);
equivalence.Interpretation!.Failed.Should().BeFalse();
equivalence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
equivalence.ContainsDiagnostics().Should().BeFalse();
equivalence.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for groundedness from the <see cref="EvaluationResult"/>.
NumericMetric groundedness = result.Get<NumericMetric>(GroundednessEvaluator.GroundednessMetricName);
groundedness.Interpretation!.Failed.Should().NotBe(true);
groundedness.Interpretation!.Failed.Should().BeFalse();
groundedness.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
groundedness.ContainsDiagnostics().Should().BeFalse();
groundedness.Value.Should().BeGreaterThanOrEqualTo(3);
@@ -34,14 +34,14 @@ public async Task Example04_InvokingCustomEvaluators()
/// Retrieve the detected measurement system from the <see cref="EvaluationResult"/>.
StringMetric measurementSystem =
result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);
measurementSystem.Interpretation!.Failed.Should().NotBe(true);
measurementSystem.Interpretation!.Failed.Should().BeFalse();
measurementSystem.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
measurementSystem.ContainsDiagnostics().Should().BeFalse();
measurementSystem.Value.Should().Be(nameof(MeasurementSystemEvaluator.MeasurementSystem.Imperial));

/// Retrieve the word count from the <see cref="EvaluationResult"/>.
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);
wordCount.Interpretation!.Failed.Should().NotBe(true);
wordCount.Interpretation!.Failed.Should().BeFalse();
wordCount.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
wordCount.ContainsDiagnostics().Should().BeFalse();
wordCount.Value.Should().BeLessThanOrEqualTo(100);
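The custom evaluators exercised here plug into the same composition machinery as the built-in ones. A sketch, assuming both custom evaluators can be constructed with their default constructors as the example does: only MeasurementSystemEvaluator consults the LLM through the supplied ChatConfiguration, while WordCountEvaluator is fully deterministic.

IEvaluator customEvaluators =
    new CompositeEvaluator(new MeasurementSystemEvaluator(), new WordCountEvaluator());
EvaluationResult result =
    await customEvaluators.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

StringMetric measurementSystem =
    result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);

// With this change, both custom evaluators also populate Reason on the metrics they return.
measurementSystem.Reason.Should().NotBeNull();
wordCount.Reason.Should().NotBeNull();
wordCount.Value.Should().BeLessThanOrEqualTo(100);
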
@@ -57,7 +57,7 @@ public async Task Example06_ChangingInterpretationOfMetrics()
return new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was empty");
reason: "The response was empty.");
}

return wordCount.Value switch
@@ -66,26 +66,26 @@
new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was empty"),
reason: "The response was empty."),
<= 20 =>
new EvaluationMetricInterpretation(
EvaluationRating.Poor,
failed: true,
reason: "The response was too short"),
reason: "The response was too short."),
<= 100 =>
new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: "The response was of an acceptable length"),
reason: "The response was of an acceptable length."),
<= 200 =>
new EvaluationMetricInterpretation(
EvaluationRating.Poor,
failed: true,
reason: "The response was long"),
reason: "The response was long."),
_ =>
new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was too long")
reason: "The response was too long.")
};
}
}
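
Example06's switch above replaces the default interpretation that WordCountEvaluator attaches. The mechanics reduce to overwriting the metric's Interpretation property after evaluation; the example wraps this logic in a function, but a direct sketch of the effect (reusing the test class members from this change set) looks like this:

EvaluationResult result =
    await new WordCountEvaluator().EvaluateAsync(s_messages, s_response);
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);

// Replace the evaluator-supplied default interpretation with a stricter one.
wordCount.Interpretation =
    wordCount.Value is > 20 and <= 100
        ? new EvaluationMetricInterpretation(
            EvaluationRating.Good,
            reason: "The response was of an acceptable length.")
        : new EvaluationMetricInterpretation(
            EvaluationRating.Unacceptable,
            failed: true,
            reason: "The response was either too short or too long.");
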
@@ -36,7 +36,7 @@ Keep your responses concise staying under 100 words as much as possible.
ChatRole.User,
"How far is the planet Venus from the Earth at its closest and furthest points?")];

private static ChatMessage s_response = new();
private static ChatResponse s_response = new();

[ClassInitialize]
public static async Task InitializeAsync(TestContext _)
@@ -53,7 +53,6 @@ public static async Task InitializeAsync(TestContext _)
};

/// Fetch the response to be evaluated and store it in a static variable <see cref="s_response" />.
ChatResponse response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
s_response = response.Message;
s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
}
}
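
The core API change in this file is that s_response is now the ChatResponse returned by GetResponseAsync rather than a single ChatMessage extracted from it. A before/after sketch (chatOptions being the local configured earlier in InitializeAsync); the stored response then flows directly into IEvaluator.EvaluateAsync, whose modelResponse parameter is now a ChatResponse as well.

// Before: only the final message was kept.
//   ChatResponse response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
//   s_response = response.Message;

// After: the whole ChatResponse is stored and passed to evaluators as-is.
s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);

// Evaluators that need the response text read it straight off the ChatResponse.
string responseText = s_response.Text;
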
@@ -135,26 +135,30 @@ private static void Interpret(StringMetric metric)
}
else if (Enum.TryParse(metric.Value, ignoreCase: true, out MeasurementSystem measurementSystem))
{
var reason = $"Detected measurement system was '{metric.Value}'.";
metric.Interpretation =
measurementSystem is MeasurementSystem.Imperial or MeasurementSystem.USCustomary
? new EvaluationMetricInterpretation(EvaluationRating.Good, reason: reason)
: new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason);
? new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: $"Detected measurement system '{metric.Value}' was part of expected set (i.e., either '{MeasurementSystem.Imperial} or '{MeasurementSystem.USCustomary}').")
: new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: $"Detected measurement system '{metric.Value}' was not part of expected set (i.e., neither '{MeasurementSystem.Imperial} nor '{MeasurementSystem.USCustomary}').");
}
else
{
metric.Interpretation =
new EvaluationMetricInterpretation(
EvaluationRating.Inconclusive,
failed: true,
reason: $"The detected measurement system '{metric.Value}' is not valid.");
reason: $"The detected measurement system '{metric.Value}' was not valid.");
}
}

/// <inheritdoc/>
public async ValueTask<EvaluationResult> EvaluateAsync(
IEnumerable<ChatMessage> messages,
ChatMessage modelResponse,
ChatResponse modelResponse,
ChatConfiguration? chatConfiguration = null,
IEnumerable<EvaluationContext>? additionalContext = null,
CancellationToken cancellationToken = default)
@@ -215,7 +219,14 @@ await chatConfiguration.ChatClient.GetResponseAsync(
/// Set the value of the <see cref="StringMetric"> (that we will return as part of the
/// <see cref="EvaluationResult"/> below) to be the text of the LLM's response (which should contain the name of
/// the detected measurement system).
metric.Value = evaluationResponse.Message.Text;
metric.Value = evaluationResponse.Text;

/// Include a reason that provides some commentary around the result. An <see cref="IEvaluator"/> can
/// optionally include such commentary to explain the scores present within any <see cref="EvaluationMetric"/>
/// that it returns. Note that although we hand craft the reason below, with some tweaks to the above prompt,
/// we could also have asked the LLM to include an explanation for the returned metric value in its response
/// and used that instead.
metric.Reason = $"The detected measurement system was '{metric.Value}'.";

/// Attach a default <see cref="EvaluationMetricInterpretation"/> for the metric. An evaluator can provide a
/// default interpretation for each metric that it produces. This default interpretation can be overridden by
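With this change a metric explains itself on two levels: Reason records why the metric has the value it does, while the Interpretation records whether that value passes and why. A consumer-side sketch, assuming the interpretation's reason is exposed through a like-named property (as its constructor argument suggests):

StringMetric measurementSystem =
    result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);

Console.WriteLine($"value:  {measurementSystem.Value}");
Console.WriteLine($"reason: {measurementSystem.Reason}");
Console.WriteLine($"rating: {measurementSystem.Interpretation!.Rating}");
Console.WriteLine($"failed: {measurementSystem.Interpretation.Failed}");
Console.WriteLine($"why:    {measurementSystem.Interpretation.Reason}");
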
@@ -56,27 +56,36 @@ private static void Interpret(NumericMetric metric)
}
else
{
var reason = $"The response was {metric.Value} words long.";
metric.Interpretation =
metric.Value <= 100
? new EvaluationMetricInterpretation(EvaluationRating.Good, reason: reason)
: new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason);
? new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: "The response was shorter than 100 words.")
: new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was longer than 100 words.");
}
}

/// <inheritdoc/>
public ValueTask<EvaluationResult> EvaluateAsync(
IEnumerable<ChatMessage> messages,
ChatMessage modelResponse,
ChatResponse modelResponse,
ChatConfiguration? chatConfiguration = null,
IEnumerable<EvaluationContext>? additionalContext = null,
CancellationToken cancellationToken = default)
{
/// Count the number of words in the supplied <see cref="modelResponse"/>.
int wordCount = CountWords(modelResponse.Text);

/// Create a <see cref="NumericMetric"/> with value set to the word count.
var metric = new NumericMetric(WordCountMetricName, value: wordCount);
var reason =
$"This {WordCountMetricName} metric has value {wordCount} because the evaluated model response contained {wordCount} words.";

/// Create a <see cref="NumericMetric"/> with value set to the word count. Also include a reason that provides
/// some commentary around the result. An <see cref="IEvaluator"/> can optionally include such commentary
/// to explain the scores present within any <see cref="EvaluationMetric"/> that it returns.
var metric = new NumericMetric(WordCountMetricName, value: wordCount, reason);

/// Attach a default <see cref="EvaluationMetricInterpretation"/> for the metric. An evaluator can provide a
/// default interpretation for each metric that it produces. This default interpretation can be overridden by
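Putting the signature and constructor changes in this file together, a complete custom evaluator against the updated API can be as small as the following. This is a hypothetical sketch, not part of the PR: the sentence-counting logic, the metric name and the thresholds are made up, and it assumes IEvaluator exposes an EvaluationMetricNames collection alongside EvaluateAsync and that an EvaluationResult can be constructed directly from a metric, as the evaluators in this repository do.

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;

public class SentenceCountEvaluator : IEvaluator
{
    public const string SentenceCountMetricName = "Sentence Count";

    public IReadOnlyCollection<string> EvaluationMetricNames { get; } = new[] { SentenceCountMetricName };

    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        // Deterministic measurement: no LLM is needed, so chatConfiguration is ignored.
        int sentenceCount =
            modelResponse.Text.Split(new[] { '.', '!', '?' }, StringSplitOptions.RemoveEmptyEntries).Length;

        string reason =
            $"This {SentenceCountMetricName} metric has value {sentenceCount} because the evaluated model response contained {sentenceCount} sentences.";

        // NumericMetric now accepts a reason alongside the name and value.
        var metric = new NumericMetric(SentenceCountMetricName, value: sentenceCount, reason);

        // Attach a default interpretation that callers can override after the fact.
        metric.Interpretation =
            sentenceCount <= 10
                ? new EvaluationMetricInterpretation(
                    EvaluationRating.Good,
                    reason: "The response was reasonably concise.")
                : new EvaluationMetricInterpretation(
                    EvaluationRating.Unacceptable,
                    failed: true,
                    reason: "The response contained too many sentences.");

        return new ValueTask<EvaluationResult>(new EvaluationResult(metric));
    }
}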