@@ -3,7 +3,7 @@
"isRoot": true,
"tools": {
"microsoft.extensions.ai.evaluation.console": {
"version": "9.3.0-preview.1.25126.9",
"version": "9.3.0-preview.1.25164.6",
"commands": [
"aieval"
],
7 changes: 6 additions & 1 deletion src/microsoft-extensions-ai-evaluation/api/Examples.sln
@@ -9,11 +9,16 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Reporting", "reporting\Repo
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Markdown Documents", "Markdown Documents", "{02EA681E-C7D8-13C7-8484-4AC65E1B71E8}"
ProjectSection(SolutionItems) = preProject
..\README.md = ..\README.md
INSTRUCTIONS.md = INSTRUCTIONS.md
..\README.md = ..\README.md
README.md = README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tool Configuration", "Tool Configuration", "{9BBB54DB-BF9B-43D0-9502-3347C9DAF717}"
ProjectSection(SolutionItems) = preProject
.config\dotnet-tools.json = .config\dotnet-tools.json
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -12,13 +12,13 @@
<ItemGroup>
<PackageReference Include="Azure.AI.OpenAI" Version="2.1.0" />
<PackageReference Include="Azure.Identity" Version="1.13.2" />
<PackageReference Include="FluentAssertions" Version="8.0.1" />
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.3.0-preview.1.25126.9" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.3.0-preview.1.25126.9" />
<PackageReference Include="Microsoft.Extensions.AI.Ollama" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.3.0-preview.1.25114.11" />
<PackageReference Include="FluentAssertions" Version="7.2.0" />
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.AzureAIInference" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="9.3.0-preview.1.25164.6" />
<PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="9.3.0-preview.1.25164.6" />
<PackageReference Include="Microsoft.Extensions.AI.Ollama" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.3.0-preview.1.25161.3" />
<PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
<PackageReference Include="MSTest.TestAdapter" Version="3.7.3" />
@@ -42,7 +42,7 @@ public async Task Example01_InvokingOneEvaluator()
/// interpretation can also be changed after the fact to suit your specific requirements if needed.
///
/// Validate the default interpretation for the returned coherence metric.
coherence.Interpretation!.Failed.Should().NotBe(true);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);

/// Evaluators such as <see cref="CoherenceEvaluator"/> above can include diagnostics on the metrics they
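For orientation, here is a condensed sketch of the single-evaluator flow this hunk asserts over. It reuses the s_messages, s_response and s_chatConfiguration members defined elsewhere in this change set and assumes the updated ChatResponse-based EvaluateAsync signature; BeFalse() simply states the expected outcome more directly than NotBe(true).

IEvaluator coherenceEvaluator = new CoherenceEvaluator();

// The LLM behind s_chatConfiguration scores the response for coherence; the evaluator also
// attaches a default interpretation (rating plus pass/fail) that the assertions inspect.
EvaluationResult result =
    await coherenceEvaluator.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

NumericMetric coherence = result.Get<NumericMetric>(CoherenceEvaluator.CoherenceMetricName);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
coherence.ContainsDiagnostics().Should().BeFalse();
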
@@ -14,12 +14,11 @@ public partial class EvaluationExamples
[TestMethod]
public async Task Example02_InvokingMultipleEvaluators()
{
/// Create a <see cref="CompositeEvaluator"/> that composes a <see cref="CoherenceEvaluator"/>, a
/// <see cref="FluencyEvaluator"/> and a <see cref="RelevanceTruthAndCompletenessEvaluator"/>.
/// Create a <see cref="CompositeEvaluator"/> that composes a <see cref="CoherenceEvaluator"/> and a
/// <see cref="FluencyEvaluator"/>.
IEvaluator coherenceEvaluator = new CoherenceEvaluator();
IEvaluator fluencyEvaluator = new FluencyEvaluator();
IEvaluator rtcEvaluator = new RelevanceTruthAndCompletenessEvaluator();
IEvaluator compositeEvaluator = new CompositeEvaluator(coherenceEvaluator, fluencyEvaluator, rtcEvaluator);
IEvaluator compositeEvaluator = new CompositeEvaluator(coherenceEvaluator, fluencyEvaluator);

/// Invoke the <see cref="CompositeEvaluator"/> to evaluate the 'coherence', 'fluency', 'relevance', 'truth'
/// and 'completeness' of the response in <see cref="s_response"/>. The evaluation is performed using the LLM
@@ -30,39 +29,16 @@ public async Task Example02_InvokingMultipleEvaluators()

/// Retrieve the score for coherence from the <see cref="EvaluationResult"/>.
NumericMetric coherence = result.Get<NumericMetric>(CoherenceEvaluator.CoherenceMetricName);
coherence.Interpretation!.Failed.Should().NotBe(true);
coherence.Interpretation!.Failed.Should().BeFalse();
coherence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
coherence.ContainsDiagnostics().Should().BeFalse();
coherence.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for fluency from the <see cref="EvaluationResult"/>.
NumericMetric fluency = result.Get<NumericMetric>(FluencyEvaluator.FluencyMetricName);
fluency.Interpretation!.Failed.Should().NotBe(true);
fluency.Interpretation!.Failed.Should().BeFalse();
fluency.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
fluency.ContainsDiagnostics().Should().BeFalse();
fluency.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for relevance from the <see cref="EvaluationResult"/>.
NumericMetric relevance =
result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.RelevanceMetricName);
relevance.Interpretation!.Failed.Should().NotBe(true);
relevance.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
relevance.ContainsDiagnostics().Should().BeFalse();
relevance.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for truth from the <see cref="EvaluationResult"/>.
NumericMetric truth = result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.TruthMetricName);
truth.Interpretation!.Failed.Should().NotBe(true);
truth.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
truth.ContainsDiagnostics().Should().BeFalse();
truth.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for completeness from the <see cref="EvaluationResult"/>.
NumericMetric completeness =
result.Get<NumericMetric>(RelevanceTruthAndCompletenessEvaluator.CompletenessMetricName);
completeness.Interpretation!.Failed.Should().NotBe(true);
completeness.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
completeness.ContainsDiagnostics().Should().BeFalse();
completeness.Value.Should().BeGreaterThanOrEqualTo(3);
}
}
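
After this change the composition covers only coherence and fluency. A minimal sketch of the resulting pattern, again reusing the test class members from this change set: a single EvaluateAsync call on the CompositeEvaluator yields one EvaluationResult that carries a metric per composed evaluator, each retrieved by the well-known name its evaluator publishes.

IEvaluator compositeEvaluator = new CompositeEvaluator(new CoherenceEvaluator(), new FluencyEvaluator());
EvaluationResult result =
    await compositeEvaluator.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

// Both metrics come back in the same result and are looked up by name.
foreach (string metricName in
    new[] { CoherenceEvaluator.CoherenceMetricName, FluencyEvaluator.FluencyMetricName })
{
    NumericMetric metric = result.Get<NumericMetric>(metricName);
    metric.Interpretation!.Failed.Should().BeFalse();
    metric.ContainsDiagnostics().Should().BeFalse();
    metric.Value.Should().BeGreaterThanOrEqualTo(3);
}
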
@@ -58,14 +58,14 @@ await compositeEvaluator.EvaluateAsync(

/// Retrieve the score for equivalence from the <see cref="EvaluationResult"/>.
NumericMetric equivalence = result.Get<NumericMetric>(EquivalenceEvaluator.EquivalenceMetricName);
equivalence.Interpretation!.Failed.Should().NotBe(true);
equivalence.Interpretation!.Failed.Should().BeFalse();
equivalence.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
equivalence.ContainsDiagnostics().Should().BeFalse();
equivalence.Value.Should().BeGreaterThanOrEqualTo(3);

/// Retrieve the score for groundedness from the <see cref="EvaluationResult"/>.
NumericMetric groundedness = result.Get<NumericMetric>(GroundednessEvaluator.GroundednessMetricName);
groundedness.Interpretation!.Failed.Should().NotBe(true);
groundedness.Interpretation!.Failed.Should().BeFalse();
groundedness.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
groundedness.ContainsDiagnostics().Should().BeFalse();
groundedness.Value.Should().BeGreaterThanOrEqualTo(3);
@@ -34,14 +34,14 @@ public async Task Example04_InvokingCustomEvaluators()
/// Retrieve the detected measurement system from the <see cref="EvaluationResult"/>.
StringMetric measurementSystem =
result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);
measurementSystem.Interpretation!.Failed.Should().NotBe(true);
measurementSystem.Interpretation!.Failed.Should().BeFalse();
measurementSystem.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
measurementSystem.ContainsDiagnostics().Should().BeFalse();
measurementSystem.Value.Should().Be(nameof(MeasurementSystemEvaluator.MeasurementSystem.Imperial));

/// Retrieve the word count from the <see cref="EvaluationResult"/>.
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);
wordCount.Interpretation!.Failed.Should().NotBe(true);
wordCount.Interpretation!.Failed.Should().BeFalse();
wordCount.Interpretation.Rating.Should().BeOneOf(EvaluationRating.Good, EvaluationRating.Exceptional);
wordCount.ContainsDiagnostics().Should().BeFalse();
wordCount.Value.Should().BeLessThanOrEqualTo(100);
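The custom evaluators exercised here plug into the same composition machinery as the built-in ones. A sketch, assuming both custom evaluators can be constructed with their default constructors as the example does: only MeasurementSystemEvaluator consults the LLM through the supplied ChatConfiguration, while WordCountEvaluator is fully deterministic.

IEvaluator customEvaluators =
    new CompositeEvaluator(new MeasurementSystemEvaluator(), new WordCountEvaluator());
EvaluationResult result =
    await customEvaluators.EvaluateAsync(s_messages, s_response, s_chatConfiguration);

StringMetric measurementSystem =
    result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);

// With this change, both custom evaluators also populate Reason on the metrics they return.
measurementSystem.Reason.Should().NotBeNull();
wordCount.Reason.Should().NotBeNull();
wordCount.Value.Should().BeLessThanOrEqualTo(100);
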
@@ -57,7 +57,7 @@ public async Task Example06_ChangingInterpretationOfMetrics()
return new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was empty");
reason: "The response was empty.");
}

return wordCount.Value switch
@@ -66,26 +66,26 @@
new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was empty"),
reason: "The response was empty."),
<= 20 =>
new EvaluationMetricInterpretation(
EvaluationRating.Poor,
failed: true,
reason: "The response was too short"),
reason: "The response was too short."),
<= 100 =>
new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: "The response was of an acceptable length"),
reason: "The response was of an acceptable length."),
<= 200 =>
new EvaluationMetricInterpretation(
EvaluationRating.Poor,
failed: true,
reason: "The response was long"),
reason: "The response was long."),
_ =>
new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was too long")
reason: "The response was too long.")
};
}
}
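
Example06's switch above replaces the default interpretation that WordCountEvaluator attaches. The mechanics reduce to overwriting the metric's Interpretation property after evaluation; the example wraps this logic in a function, but a direct sketch of the effect (reusing the test class members from this change set) looks like this:

EvaluationResult result =
    await new WordCountEvaluator().EvaluateAsync(s_messages, s_response);
NumericMetric wordCount = result.Get<NumericMetric>(WordCountEvaluator.WordCountMetricName);

// Replace the evaluator-supplied default interpretation with a stricter one.
wordCount.Interpretation =
    wordCount.Value is > 20 and <= 100
        ? new EvaluationMetricInterpretation(
            EvaluationRating.Good,
            reason: "The response was of an acceptable length.")
        : new EvaluationMetricInterpretation(
            EvaluationRating.Unacceptable,
            failed: true,
            reason: "The response was either too short or too long.");
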
@@ -36,7 +36,7 @@ Keep your responses concise staying under 100 words as much as possible.
ChatRole.User,
"How far is the planet Venus from the Earth at its closest and furthest points?")];

private static ChatMessage s_response = new();
private static ChatResponse s_response = new();

[ClassInitialize]
public static async Task InitializeAsync(TestContext _)
@@ -53,7 +53,6 @@ public static async Task InitializeAsync(TestContext _)
};

/// Fetch the response to be evaluated and store it in a static variable <see cref="s_response" />.
ChatResponse response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
s_response = response.Message;
s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
}
}
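
The core API change in this file is that s_response is now the ChatResponse returned by GetResponseAsync rather than a single ChatMessage extracted from it. A before/after sketch (chatOptions being the local configured earlier in InitializeAsync); the stored response then flows directly into IEvaluator.EvaluateAsync, whose modelResponse parameter is now a ChatResponse as well.

// Before: only the final message was kept.
//   ChatResponse response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);
//   s_response = response.Message;

// After: the whole ChatResponse is stored and passed to evaluators as-is.
s_response = await s_chatConfiguration.ChatClient.GetResponseAsync(s_messages, chatOptions);

// Evaluators that need the response text read it straight off the ChatResponse.
string responseText = s_response.Text;
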
@@ -135,26 +135,30 @@ private static void Interpret(StringMetric metric)
}
else if (Enum.TryParse(metric.Value, ignoreCase: true, out MeasurementSystem measurementSystem))
{
var reason = $"Detected measurement system was '{metric.Value}'.";
metric.Interpretation =
measurementSystem is MeasurementSystem.Imperial or MeasurementSystem.USCustomary
? new EvaluationMetricInterpretation(EvaluationRating.Good, reason: reason)
: new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason);
? new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: $"Detected measurement system '{metric.Value}' was part of expected set (i.e., either '{MeasurementSystem.Imperial} or '{MeasurementSystem.USCustomary}').")
: new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: $"Detected measurement system '{metric.Value}' was not part of expected set (i.e., neither '{MeasurementSystem.Imperial} nor '{MeasurementSystem.USCustomary}').");
}
else
{
metric.Interpretation =
new EvaluationMetricInterpretation(
EvaluationRating.Inconclusive,
failed: true,
reason: $"The detected measurement system '{metric.Value}' is not valid.");
reason: $"The detected measurement system '{metric.Value}' was not valid.");
}
}

/// <inheritdoc/>
public async ValueTask<EvaluationResult> EvaluateAsync(
IEnumerable<ChatMessage> messages,
ChatMessage modelResponse,
ChatResponse modelResponse,
ChatConfiguration? chatConfiguration = null,
IEnumerable<EvaluationContext>? additionalContext = null,
CancellationToken cancellationToken = default)
@@ -215,7 +219,14 @@ await chatConfiguration.ChatClient.GetResponseAsync(
/// Set the value of the <see cref="StringMetric"> (that we will return as part of the
/// <see cref="EvaluationResult"/> below) to be the text of the LLM's response (which should contain the name of
/// the detected measurement system).
metric.Value = evaluationResponse.Message.Text;
metric.Value = evaluationResponse.Text;

/// Include a reason that provides some commentary around the result. An <see cref="IEvaluator"/> can
/// optionally include such commentary to explain the scores present within any <see cref="EvaluationMetric"/>
/// that it returns. Note that although we hand craft the reason below, with some tweaks to the above prompt,
/// we could also have asked the LLM to include an explanation for the returned metric value in its response
/// and used that instead.
metric.Reason = $"The detected measurement system was '{metric.Value}'.";

/// Attach a default <see cref="EvaluationMetricInterpretation"/> for the metric. An evaluator can provide a
/// default interpretation for each metric that it produces. This default interpretation can be overridden by
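With this change a metric explains itself on two levels: Reason records why the metric has the value it does, while the Interpretation records whether that value passes and why. A consumer-side sketch, assuming the interpretation's reason is exposed through a like-named property (as its constructor argument suggests):

StringMetric measurementSystem =
    result.Get<StringMetric>(MeasurementSystemEvaluator.MeasurementSystemMetricName);

Console.WriteLine($"value:  {measurementSystem.Value}");
Console.WriteLine($"reason: {measurementSystem.Reason}");
Console.WriteLine($"rating: {measurementSystem.Interpretation!.Rating}");
Console.WriteLine($"failed: {measurementSystem.Interpretation.Failed}");
Console.WriteLine($"why:    {measurementSystem.Interpretation.Reason}");
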
@@ -56,27 +56,36 @@ private static void Interpret(NumericMetric metric)
}
else
{
var reason = $"The response was {metric.Value} words long.";
metric.Interpretation =
metric.Value <= 100
? new EvaluationMetricInterpretation(EvaluationRating.Good, reason: reason)
: new EvaluationMetricInterpretation(EvaluationRating.Unacceptable, failed: true, reason);
? new EvaluationMetricInterpretation(
EvaluationRating.Good,
reason: "The response was shorter than 100 words.")
: new EvaluationMetricInterpretation(
EvaluationRating.Unacceptable,
failed: true,
reason: "The response was longer than 100 words.");
}
}

/// <inheritdoc/>
public ValueTask<EvaluationResult> EvaluateAsync(
IEnumerable<ChatMessage> messages,
ChatMessage modelResponse,
ChatResponse modelResponse,
ChatConfiguration? chatConfiguration = null,
IEnumerable<EvaluationContext>? additionalContext = null,
CancellationToken cancellationToken = default)
{
/// Count the number of words in the supplied <see cref="modelResponse"/>.
int wordCount = CountWords(modelResponse.Text);

/// Create a <see cref="NumericMetric"/> with value set to the word count.
var metric = new NumericMetric(WordCountMetricName, value: wordCount);
var reason =
$"This {WordCountMetricName} metric has value {wordCount} because the evaluated model response contained {wordCount} words.";

/// Create a <see cref="NumericMetric"/> with value set to the word count. Also include a reason that provides
/// some commentary around the result. An <see cref="IEvaluator"/> can optionally include such commentary
/// to explain the scores present within any <see cref="EvaluationMetric"/> that it returns.
var metric = new NumericMetric(WordCountMetricName, value: wordCount, reason);

/// Attach a default <see cref="EvaluationMetricInterpretation"/> for the metric. An evaluator can provide a
/// default interpretation for each metric that it produces. This default interpretation can be overridden by
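Putting the signature and constructor changes in this file together, a complete custom evaluator against the updated API can be as small as the following. This is a hypothetical sketch, not part of the PR: the sentence-counting logic, the metric name and the thresholds are made up, and it assumes IEvaluator exposes an EvaluationMetricNames collection alongside EvaluateAsync and that an EvaluationResult can be constructed directly from a metric, as the evaluators in this repository do.

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;

public class SentenceCountEvaluator : IEvaluator
{
    public const string SentenceCountMetricName = "Sentence Count";

    public IReadOnlyCollection<string> EvaluationMetricNames { get; } = new[] { SentenceCountMetricName };

    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        // Deterministic measurement: no LLM is needed, so chatConfiguration is ignored.
        int sentenceCount =
            modelResponse.Text.Split(new[] { '.', '!', '?' }, StringSplitOptions.RemoveEmptyEntries).Length;

        string reason =
            $"This {SentenceCountMetricName} metric has value {sentenceCount} because the evaluated model response contained {sentenceCount} sentences.";

        // NumericMetric now accepts a reason alongside the name and value.
        var metric = new NumericMetric(SentenceCountMetricName, value: sentenceCount, reason);

        // Attach a default interpretation that callers can override after the fact.
        metric.Interpretation =
            sentenceCount <= 10
                ? new EvaluationMetricInterpretation(
                    EvaluationRating.Good,
                    reason: "The response was reasonably concise.")
                : new EvaluationMetricInterpretation(
                    EvaluationRating.Unacceptable,
                    failed: true,
                    reason: "The response contained too many sentences.");

        return new ValueTask<EvaluationResult>(new EvaluationResult(metric));
    }
}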