| | | 1 | | using Microsoft.Extensions.AI; |
| | | 2 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 3 | | |
| | | 4 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 5 | | |
| | | 6 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 7 | | |
| | | 8 | | /// <summary> |
| | | 9 | | /// Deterministic evaluator that scores the iteration coherence of an iterative-loop |
| | | 10 | | /// agent run from the captured <see cref="IAgentRunDiagnostics"/> snapshot carried in |
| | | 11 | | /// an <see cref="AgentRunDiagnosticsContext"/>. |
| | | 12 | | /// </summary> |
| | | 13 | | /// <remarks> |
| | | 14 | | /// <para> |
| | | 15 | | /// This evaluator only produces metrics when |
| | | 16 | | /// <see cref="IAgentRunDiagnostics.ExecutionMode"/> is <c>"IterativeLoop"</c>. For any |
| | | 17 | | /// other execution mode (or when the context is missing) the evaluator returns an |
| | | 18 | | /// empty <see cref="EvaluationResult"/>, which callers should treat as "not applicable". |
| | | 19 | | /// </para> |
| | | 20 | | /// <para> |
| | | 21 | | /// When applicable, the evaluator emits: |
| | | 22 | | /// </para> |
| | | 23 | | /// <list type="bullet"> |
| | | 24 | | /// <item><description><c>Iteration Count</c> — number of LLM iterations, derived from <see cref="IAgentRunDiagnostics |
| | | 25 | | /// <item><description><c>Iteration Empty Outputs</c> — number of iterations whose <see cref="ChatCompletionDiagnostic |
| | | 26 | | /// <item><description><c>Terminated Coherently</c> — boolean rollup. <see langword="true"/> when the run succeeded, p |
| | | 27 | | /// <item><description><c>Iteration Efficiency Ratio</c> — ratio of useful iterations (produced text output or trigger |
| | | 28 | | /// <item><description><c>Degenerate Loop Detected</c> — boolean. <see langword="true"/> when two or more consecutive |
| | | 29 | | /// <item><description><c>Max Iterations Hit</c> — boolean. <see langword="true"/> when the iteration count reached or |
| | | 30 | | /// </list> |
| | | 31 | | /// </remarks> |
| | | 32 | | public sealed class IterationCoherenceEvaluator : IEvaluator |
| | | 33 | | { |
| | | 34 | | /// <summary>The execution mode value that gates this evaluator.</summary> |
| | | 35 | | public const string IterativeLoopExecutionMode = "IterativeLoop"; |
| | | 36 | | |
| | | 37 | | /// <summary>Metric name for the iteration count.</summary> |
| | | 38 | | public const string IterationCountMetricName = "Iteration Count"; |
| | | 39 | | |
| | | 40 | | /// <summary>Metric name for the count of iterations with empty output.</summary> |
| | | 41 | | public const string EmptyOutputsMetricName = "Iteration Empty Outputs"; |
| | | 42 | | |
| | | 43 | | /// <summary>Metric name for the boolean rollup indicating coherent termination.</summary> |
| | | 44 | | public const string TerminatedCoherentlyMetricName = "Terminated Coherently"; |
| | | 45 | | |
| | | 46 | | /// <summary>Metric name for the ratio of useful iterations to total iterations.</summary> |
| | | 47 | | public const string EfficiencyRatioMetricName = "Iteration Efficiency Ratio"; |
| | | 48 | | |
| | | 49 | | /// <summary>Metric name for the boolean indicating a degenerate (repeated-output) loop.</summary> |
| | | 50 | | public const string DegenerateLoopMetricName = "Degenerate Loop Detected"; |
| | | 51 | | |
| | | 52 | | /// <summary>Metric name for the boolean indicating the iteration count reached maxIterations.</summary> |
| | | 53 | | public const string MaxIterationsHitMetricName = "Max Iterations Hit"; |
| | | 54 | | |
| | | 55 | | private readonly int? _maxIterations; |
| | | 56 | | |
/// <summary>
/// Creates a new <see cref="IterationCoherenceEvaluator"/>.
/// </summary>
/// <param name="maxIterations">
/// Optional expected iteration limit. When provided, the evaluator advertises and emits the
/// <see cref="MaxIterationsHitMetricName"/> metric. When <see langword="null"/>,
/// the metric is omitted.
/// </param>
public IterationCoherenceEvaluator(int? maxIterations = null)
{
    _maxIterations = maxIterations;

    // The five core metrics are always advertised; the max-iterations metric is
    // only advertised when a limit was supplied.
    var names = new List<string>
    {
        IterationCountMetricName,
        EmptyOutputsMetricName,
        TerminatedCoherentlyMetricName,
        EfficiencyRatioMetricName,
        DegenerateLoopMetricName,
    };
    if (maxIterations.HasValue)
    {
        names.Add(MaxIterationsHitMetricName);
    }

    // Publish a read-only wrapper rather than the List itself, so a caller cannot
    // downcast EvaluationMetricNames to List<string> and mutate the advertised names.
    EvaluationMetricNames = names.AsReadOnly();
}
| | | 83 | | |
| | | 84 | | /// <inheritdoc /> |
| | 0 | 85 | | public IReadOnlyCollection<string> EvaluationMetricNames { get; } |
| | | 86 | | |
| | | 87 | | /// <inheritdoc /> |
| | | 88 | | public ValueTask<EvaluationResult> EvaluateAsync( |
| | | 89 | | IEnumerable<ChatMessage> messages, |
| | | 90 | | ChatResponse modelResponse, |
| | | 91 | | ChatConfiguration? chatConfiguration = null, |
| | | 92 | | IEnumerable<EvaluationContext>? additionalContext = null, |
| | | 93 | | CancellationToken cancellationToken = default) |
| | | 94 | | { |
| | 18 | 95 | | var diagnostics = additionalContext? |
| | 18 | 96 | | .OfType<AgentRunDiagnosticsContext>() |
| | 18 | 97 | | .FirstOrDefault()? |
| | 18 | 98 | | .Diagnostics; |
| | | 99 | | |
| | 18 | 100 | | if (diagnostics is null || |
| | 18 | 101 | | !string.Equals(diagnostics.ExecutionMode, IterativeLoopExecutionMode, StringComparison.Ordinal)) |
| | | 102 | | { |
| | 3 | 103 | | return new ValueTask<EvaluationResult>(new EvaluationResult()); |
| | | 104 | | } |
| | | 105 | | |
| | 15 | 106 | | var completions = diagnostics.ChatCompletions; |
| | 15 | 107 | | var iterationCount = completions.Count; |
| | 15 | 108 | | var emptyOutputs = 0; |
| | 15 | 109 | | var usefulIterations = 0; |
| | 96 | 110 | | for (var i = 0; i < completions.Count; i++) |
| | | 111 | | { |
| | 33 | 112 | | var hasTextOutput = completions[i].ResponseCharCount > 0; |
| | 33 | 113 | | var hasFunctionCalls = completions[i].Response?.Messages |
| | 38 | 114 | | .Any(m => m.Contents.OfType<FunctionCallContent>().Any()) ?? false; |
| | | 115 | | |
| | 33 | 116 | | if (!hasTextOutput) |
| | | 117 | | { |
| | 3 | 118 | | emptyOutputs++; |
| | | 119 | | } |
| | | 120 | | |
| | 33 | 121 | | if (hasTextOutput || hasFunctionCalls) |
| | | 122 | | { |
| | 31 | 123 | | usefulIterations++; |
| | | 124 | | } |
| | | 125 | | } |
| | | 126 | | |
| | 15 | 127 | | var finalIterationProducedOutput = |
| | 15 | 128 | | iterationCount > 0 && completions[iterationCount - 1].ResponseCharCount > 0; |
| | 15 | 129 | | var terminatedCoherently = |
| | 15 | 130 | | diagnostics.Succeeded && |
| | 15 | 131 | | iterationCount > 0 && |
| | 15 | 132 | | finalIterationProducedOutput; |
| | 15 | 133 | | var efficiencyRatio = iterationCount > 0 |
| | 15 | 134 | | ? (double)usefulIterations / iterationCount |
| | 15 | 135 | | : 0; |
| | 15 | 136 | | var degenerateLoop = DetectDegenerateLoop(completions); |
| | | 137 | | |
| | 15 | 138 | | var metrics = new List<EvaluationMetric> |
| | 15 | 139 | | { |
| | 15 | 140 | | new NumericMetric( |
| | 15 | 141 | | IterationCountMetricName, |
| | 15 | 142 | | value: iterationCount, |
| | 15 | 143 | | reason: iterationCount == 0 |
| | 15 | 144 | | ? "No iterations were recorded." |
| | 15 | 145 | | : $"{iterationCount} iteration(s) were recorded."), |
| | 15 | 146 | | |
| | 15 | 147 | | new NumericMetric( |
| | 15 | 148 | | EmptyOutputsMetricName, |
| | 15 | 149 | | value: emptyOutputs, |
| | 15 | 150 | | reason: emptyOutputs == 0 |
| | 15 | 151 | | ? "Every iteration produced non-empty output." |
| | 15 | 152 | | : $"{emptyOutputs} of {iterationCount} iteration(s) produced empty output."), |
| | 15 | 153 | | |
| | 15 | 154 | | new BooleanMetric( |
| | 15 | 155 | | TerminatedCoherentlyMetricName, |
| | 15 | 156 | | value: terminatedCoherently, |
| | 15 | 157 | | reason: terminatedCoherently |
| | 15 | 158 | | ? "The iterative loop succeeded and the final iteration produced output." |
| | 15 | 159 | | : BuildIncoherentReason(diagnostics, iterationCount, finalIterationProducedOutput)), |
| | 15 | 160 | | |
| | 15 | 161 | | new NumericMetric( |
| | 15 | 162 | | EfficiencyRatioMetricName, |
| | 15 | 163 | | value: efficiencyRatio, |
| | 15 | 164 | | reason: iterationCount == 0 |
| | 15 | 165 | | ? "No iterations to compute efficiency." |
| | 15 | 166 | | : $"{usefulIterations} of {iterationCount} iteration(s) were useful (produced text or triggered tool |
| | 15 | 167 | | |
| | 15 | 168 | | new BooleanMetric( |
| | 15 | 169 | | DegenerateLoopMetricName, |
| | 15 | 170 | | value: degenerateLoop, |
| | 15 | 171 | | reason: degenerateLoop |
| | 15 | 172 | | ? "Two or more consecutive iterations produced identical text output." |
| | 15 | 173 | | : "No consecutive duplicate outputs detected."), |
| | 15 | 174 | | }; |
| | | 175 | | |
| | 15 | 176 | | if (_maxIterations.HasValue) |
| | | 177 | | { |
| | 3 | 178 | | var hit = iterationCount >= _maxIterations.Value; |
| | 3 | 179 | | metrics.Add(new BooleanMetric( |
| | 3 | 180 | | MaxIterationsHitMetricName, |
| | 3 | 181 | | value: hit, |
| | 3 | 182 | | reason: hit |
| | 3 | 183 | | ? $"Iteration count ({iterationCount}) reached or exceeded the configured limit ({_maxIterations.Val |
| | 3 | 184 | | : $"Iteration count ({iterationCount}) is below the configured limit ({_maxIterations.Value}).")); |
| | | 185 | | } |
| | | 186 | | |
| | 15 | 187 | | return new ValueTask<EvaluationResult>(new EvaluationResult(metrics.ToArray())); |
| | | 188 | | } |
| | | 189 | | |
/// <summary>
/// Reports whether any two adjacent completions carry identical, non-empty text
/// (compared ordinally) as returned by <see cref="GetAggregateText"/>.
/// </summary>
/// <param name="completions">The recorded completions, in iteration order.</param>
/// <returns>
/// <see langword="true"/> as soon as one adjacent pair matches; otherwise
/// <see langword="false"/>. Fewer than two completions always yields
/// <see langword="false"/> because there is no pair to compare.
/// </returns>
private static bool DetectDegenerateLoop(IReadOnlyList<ChatCompletionDiagnostics> completions)
{
    // Starting at index 1 means the loop body never runs for zero or one completion.
    for (var index = 1; index < completions.Count; index++)
    {
        // A pair is only comparable when both iterations captured a response.
        if (completions[index - 1].Response is not { } earlier ||
            completions[index].Response is not { } later)
        {
            continue;
        }

        // Missing or empty text on either side never counts as a repeat.
        if (GetAggregateText(earlier) is { } earlierText &&
            GetAggregateText(later) is { } laterText &&
            string.Equals(earlierText, laterText, StringComparison.Ordinal))
        {
            return true;
        }
    }

    return false;
}
| | | 220 | | |
/// <summary>
/// Returns the text of the last message in <paramref name="response"/>.
/// </summary>
/// <param name="response">The response whose final message is inspected.</param>
/// <returns>
/// The last message's text, or <see langword="null"/> when the response has no
/// messages or that text is null or empty.
/// </returns>
private static string? GetAggregateText(ChatResponse response)
{
    var messages = response.Messages;
    var lastIndex = messages.Count - 1;
    if (lastIndex < 0)
    {
        return null;
    }

    // Only the final message is consulted; earlier messages are ignored.
    return messages[lastIndex].Text is { Length: > 0 } text ? text : null;
}
| | | 231 | | |
/// <summary>
/// Picks the explanation for a run that did not terminate coherently. The checks
/// are ordered: failed run first, then zero iterations, then an empty final iteration.
/// </summary>
/// <param name="diagnostics">The run diagnostics; only <c>Succeeded</c> is read.</param>
/// <param name="iterationCount">Number of recorded iterations.</param>
/// <param name="finalIterationProducedOutput">Whether the last iteration produced output.</param>
/// <returns>A human-readable reason string for the metric.</returns>
private static string BuildIncoherentReason(
    IAgentRunDiagnostics diagnostics,
    int iterationCount,
    bool finalIterationProducedOutput) =>
    (diagnostics.Succeeded, iterationCount, finalIterationProducedOutput) switch
    {
        (false, _, _) => "The agent run did not complete successfully.",
        (true, 0, _) => "The agent run succeeded but recorded zero iterations.",
        (true, _, false) => "The final iteration produced no output.",
        // Fallback for a successful run with iterations and a non-empty final output.
        _ => "Iterative-loop termination is incoherent.",
    };
| | | 251 | | } |