Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness.
* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural
language processing tasks. Evaluators currently include BLEU, GLEU and F1 scores.
* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
Expand Down Expand Up @@ -79,8 +80,8 @@ public ValueTask<EvaluationResult> EvaluateAsync(

var (score, duration) = TimingHelper.ExecuteWithTiming(() =>
{
var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference));
var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);
var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference).ToArray()).ToArray();
var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text).ToArray();
return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ public sealed class BLEUEvaluatorContext : EvaluationContext
/// <param name="references">
/// The reference responses against which the response that is being evaluated is compared.
/// </param>
public BLEUEvaluatorContext(IEnumerable<string> references)
    : this(references.ToArray())
{
}

/// <summary>
/// Initializes a new instance of the <see cref="BLEUEvaluatorContext"/> class.
/// </summary>
/// <param name="references">
/// The reference responses against which the response that is being evaluated is compared.
/// </param>
public BLEUEvaluatorContext(params string[] references)
    : base(
        name: BLEUContextName,
        contents: [.. references.Select(c => new TextContent(c))])
{
    // Keep a defensive copy so later mutation of the caller's array cannot
    // affect the References exposed by this context.
    References = [.. references];
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;
/// </summary>
internal static class BLEUAlgorithm
{
internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references, int hypLength)
internal static int ClosestRefLength(string[][] references, int hypLength)
{
if (!references.Any())
{
Expand All @@ -27,7 +27,7 @@ internal static int ClosestRefLength(IEnumerable<IEnumerable<string>> references
int smallestDiff = int.MaxValue;
foreach (var reference in references)
{
int refLength = reference.Count();
int refLength = reference.Length;
int diff = Math.Abs(refLength - hypLength);
if (diff < smallestDiff ||
(diff == smallestDiff && refLength < closestRefLength))
Expand Down Expand Up @@ -55,26 +55,26 @@ internal static double BrevityPenalty(int closestRefLength, int hypLength)
return Math.Exp(1 - ((double)closestRefLength / hypLength));
}

internal static RationalNumber ModifiedPrecision(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis, int n = 1)
internal static RationalNumber ModifiedPrecision(string[][] references, string[] hypothesis, int n = 1)
{
if (n <= 0)
{
Throw.ArgumentOutOfRangeException(nameof(n), $"`{nameof(n)}` must be greater than zero.");
}

if (!references.Any() || !hypothesis.Any())
if (references.Length == 0 || hypothesis.Length == 0)
{
return RationalNumber.Zero;
}

var hyp = hypothesis.CreateNGrams(n);
var hypCounts = new MatchCounter<NGram<string>>(hyp);
var hypGrams = hypothesis.AsSpan().CreateNGrams(n);
var hypCounts = new MatchCounter<NGram<string>>(hypGrams);

Dictionary<NGram<string>, int> maxCounts = [];

foreach (var rf in references)
{
IEnumerable<NGram<string>> refGrams = rf.CreateNGrams(n);
IEnumerable<NGram<string>> refGrams = rf.AsSpan().CreateNGrams(n);
var refCounts = new MatchCounter<NGram<string>>(refGrams);

foreach (var ct in refCounts)
Expand Down Expand Up @@ -123,25 +123,28 @@ internal static double[] EqualWeights(int n)
}

double[] weights = new double[n];
#if NET8_0_OR_GREATER
Array.Fill(weights, 1.0 / n);
#else
for (int i = 0; i < n; i++)
{
weights[i] = 1.0 / n;
}

#endif
return weights;
}

internal static readonly double[] DefaultBLEUWeights = EqualWeights(4);

internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references, IEnumerable<string> hypothesis,
internal static double SentenceBLEU(string[][] references, string[] hypothesis,
double[]? weights = null, Func<RationalNumber[], int, double[]>? smoothingFunction = null)
{
if (references == null || !references.Any())
if (references == null || references.Length == 0)
{
Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty.");
}

if (hypothesis == null || !hypothesis.Any())
if (hypothesis == null || hypothesis.Length == 0)
{
Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty.");
}
Expand Down Expand Up @@ -171,7 +174,7 @@ internal static double SentenceBLEU(IEnumerable<IEnumerable<string>> references,
precisionValues[i] = prec;
}

int hypLen = hypothesis.Count();
int hypLen = hypothesis.Length;
int closestRefLength = ClosestRefLength(references, hypLen);
double brevityPenalty = BrevityPenalty(closestRefLength, hypLen);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Linq;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;

internal static class F1Algorithm
{
    /// <summary>
    /// Calculates the token-level F1 score (harmonic mean of precision and recall)
    /// between <paramref name="response"/> and <paramref name="groundTruth"/>.
    /// </summary>
    /// <param name="groundTruth">The reference (expected) tokens. Cannot be null or empty.</param>
    /// <param name="response">The tokens of the response being evaluated. Cannot be null or empty.</param>
    /// <returns>The F1 score in the range [0, 1].</returns>
    public static double CalculateF1Score(IEnumerable<string> groundTruth, IEnumerable<string> response)
    {
        // Materialize each sequence exactly once; the original inputs may be lazy
        // enumerables and are otherwise enumerated multiple times below.
        string[]? groundTruthTokens = groundTruth?.ToArray();
        if (groundTruthTokens == null || groundTruthTokens.Length == 0)
        {
            Throw.ArgumentNullException(nameof(groundTruth), $"'{nameof(groundTruth)}' cannot be null or empty.");
        }

        string[]? responseTokens = response?.ToArray();
        if (responseTokens == null || responseTokens.Length == 0)
        {
            Throw.ArgumentNullException(nameof(response), $"'{nameof(response)}' cannot be null or empty.");
        }

        var referenceCounts = new MatchCounter<string>(groundTruthTokens);
        var predictionCounts = new MatchCounter<string>(responseTokens);

        // Overlap counts each shared token min(count in reference, count in prediction) times.
        int numCommonTokens = referenceCounts.Intersect(predictionCounts).Sum();

        if (numCommonTokens == 0)
        {
            return 0.0; // F1 score is 0 if there are no common tokens
        }

        double precision = (double)numCommonTokens / responseTokens.Length;
        double recall = (double)numCommonTokens / groundTruthTokens.Length;
        return (2.0 * precision * recall) / (precision + recall);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;

internal static class GLEUAlgorithm
{
    /// <summary>
    /// Computes the sentence-level GLEU (Google-BLEU) score of <paramref name="hypothesis"/>
    /// against one or more <paramref name="references"/>, matching all n-grams of sizes
    /// <paramref name="minN"/> through <paramref name="maxN"/>.
    /// </summary>
    /// <param name="references">The reference token sequences. Cannot be null or empty.</param>
    /// <param name="hypothesis">The hypothesis token sequence. Cannot be null or empty.</param>
    /// <param name="minN">The smallest n-gram size considered.</param>
    /// <param name="maxN">The largest n-gram size considered.</param>
    /// <returns>A score in the range [0, 1].</returns>
    internal static double SentenceGLEU(string[][] references, string[] hypothesis, int minN = 1, int maxN = 4)
    {
        if (references == null || references.Length == 0)
        {
            Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty.");
        }

        if (hypothesis == null || hypothesis.Length == 0)
        {
            Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty.");
        }

        MatchCounter<NGram<string>> hypNGrams = new(hypothesis.AsSpan().CreateAllNGrams(minN, maxN));
        int truePosFalsePos = hypNGrams.Sum();

        // Per NLTK's gleu_score (which this implementation mirrors), the hypothesis is
        // scored against each reference and only the (matches, total) pair from the
        // reference yielding the HIGHEST ratio is kept — counts are not summed across
        // references. With a single reference the two approaches coincide.
        int bestTruePos = 0;
        int bestNAll = 0;
        double bestRatio = -1.0;

        foreach (var reference in references)
        {
            MatchCounter<NGram<string>> refNGrams = new(reference.AsSpan().CreateAllNGrams(minN, maxN));
            int truePosFalseNeg = refNGrams.Sum();

            int truePos = hypNGrams.Intersect(refNGrams).Sum();
            int nAll = Math.Max(truePosFalsePos, truePosFalseNeg);

            if (nAll > 0)
            {
                double ratio = (double)truePos / nAll;
                if (ratio > bestRatio)
                {
                    bestRatio = ratio;
                    bestTruePos = truePos;
                    bestNAll = nAll;
                }
            }
        }

        return bestNAll == 0 ? 0.0 : (double)bestTruePos / bestNAll;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,22 @@ public void AddRange(IEnumerable<T> items)
}
}

public string ToDebugString() => string.Concat(_counts.Select(v => $"{v.Key}: {v.Value}, "));
/// <summary>
/// Returns a new counter containing only the keys present in both this counter and
/// <paramref name="other"/>, each mapped to the smaller of the two counts.
/// </summary>
public MatchCounter<T> Intersect(MatchCounter<T> other)
{
    _ = Throw.IfNull(other, nameof(other));

    var result = new MatchCounter<T>();
    foreach (var pair in _counts)
    {
        if (other._counts.TryGetValue(pair.Key, out int countInOther))
        {
            result._counts[pair.Key] = Math.Min(pair.Value, countInOther);
        }
    }

    return result;
}

public string ToDebugString() => string.Join(",", _counts.Select(v => $"{v.Key}: {v.Value}"));

public IEnumerator<KeyValuePair<T, int>> GetEnumerator() => _counts.GetEnumerator();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.Common;
Expand All @@ -14,28 +13,73 @@ internal static class NGramExtensions
/// <summary>Creates a single n-gram containing every element of <paramref name="values"/>.</summary>
public static NGram<T> CreateNGram<T>(this ReadOnlySpan<T> values)
where T : IEquatable<T> => new(values);

/// <summary>Span overload; forwards to the <see cref="ReadOnlySpan{T}"/> implementation.</summary>
internal static List<NGram<T>> CreateNGrams<T>(this Span<T> input, int n)
where T : IEquatable<T>
=> CreateNGrams((ReadOnlySpan<T>)input, n);

/// <summary>
/// Create a sequence of n-grams from the input sequence.
/// </summary>
/// <param name="input">The input sequence of items.</param>
/// <param name="n">The size of each n-gram.</param>
/// <returns>All contiguous n-grams of length <paramref name="n"/>, in order of occurrence.</returns>
internal static List<NGram<T>> CreateNGrams<T>(this ReadOnlySpan<T> input, int n)
    where T : IEquatable<T>
{
    if (n <= 0)
    {
        Throw.ArgumentOutOfRangeException(nameof(n), $"'{nameof(n)}' must be greater than zero.");
    }

    // There are exactly (Length - n + 1) windows when the input is long enough, none otherwise.
    List<NGram<T>> nGrams = new(Math.Max(0, input.Length - n + 1));

    for (int start = 0; start + n <= input.Length; start++)
    {
        nGrams.Add(new NGram<T>(input.Slice(start, n)));
    }

    return nGrams;
}

/// <summary>Span overload; forwards to the <see cref="ReadOnlySpan{T}"/> implementation.</summary>
internal static List<NGram<T>> CreateAllNGrams<T>(this Span<T> input, int minN, int maxN = -1)
where T : IEquatable<T>
=> CreateAllNGrams((ReadOnlySpan<T>)input, minN, maxN);

/// <summary>
/// Create a sequence of all n-grams from the input sequence from minN to maxN.
/// </summary>
/// <param name="input">The input sequence of items.</param>
/// <param name="minN">The minimum size of n-gram.</param>
/// <param name="maxN">The maximum size of n-gram. If not specified, the default is to include up to length of the input.</param>
internal static List<NGram<T>> CreateAllNGrams<T>(this ReadOnlySpan<T> input, int minN, int maxN = -1)
    where T : IEquatable<T>
{
    _ = Throw.IfLessThanOrEqual(minN, 0, nameof(minN));

    if (maxN < 0)
    {
        // Negative sentinel: no upper bound beyond the input length itself.
        maxN = input.Length;
    }
    else if (maxN < minN)
    {
        Throw.ArgumentOutOfRangeException(nameof(maxN), $"'{nameof(maxN)}' must be greater than or equal to '{nameof(minN)}'.");
    }

    List<NGram<T>> nGrams = [];

    for (int start = 0; start <= input.Length - minN; start++)
    {
        // Longest n-gram that both fits the remaining input and respects maxN.
        int longest = Math.Min(maxN, input.Length - start);
        for (int size = minN; size <= longest; size++)
        {
            nGrams.Add(new NGram<T>(input.Slice(start, size)));
        }
    }

    return nGrams;
}
}

Loading
Loading