| | | 1 | | using System.Text.Json; |
| | | 2 | | |
| | | 3 | | using Microsoft.Extensions.AI; |
| | | 4 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 5 | | |
| | | 6 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 7 | | |
| | | 8 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 9 | | |
| | | 10 | | /// <summary> |
| | | 11 | | /// Deterministic evaluator that scores the tool-call trajectory of an agent run from |
| | | 12 | | /// the captured <see cref="IAgentRunDiagnostics"/> snapshot carried in an |
| | | 13 | | /// <see cref="AgentRunDiagnosticsContext"/>. |
| | | 14 | | /// </summary> |
| | | 15 | | /// <remarks> |
| | | 16 | | /// <para> |
| | | 17 | | /// This evaluator never contacts a language model. It reads the ordered |
| | | 18 | | /// <see cref="IAgentRunDiagnostics.ToolCalls"/> collection and produces: |
| | | 19 | | /// </para> |
| | | 20 | | /// <list type="bullet"> |
| | | 21 | | /// <item><description><c>Tool Calls Total</c> — total number of tool invocations.</description></item> |
| | | 22 | | /// <item><description><c>Tool Calls Failed</c> — count of tool invocations whose <see cref="ToolCallDiagnostics.Succe |
| | | 23 | | /// <item><description><c>Tool Call Sequence Gaps</c> — number of missing slots in the <see cref="ToolCallDiagnostics. |
| | | 24 | | /// <item><description><c>All Tool Calls Succeeded</c> — boolean rollup. <see langword="true"/> when every tool invoca |
| | | 25 | | /// <item><description><c>Consecutive Same-Tool Calls</c> — count of consecutive tool invocations with the same <see c |
| | | 26 | | /// <item><description><c>Per-Tool Failure Rate</c> — JSON string mapping each tool name to its failure rate (0.0–1.0) |
| | | 27 | | /// <item><description><c>Tool Call Latency P50</c> — 50th percentile of tool call durations in milliseconds (nearest- |
| | | 28 | | /// <item><description><c>Tool Call Latency P95</c> — 95th percentile of tool call durations in milliseconds (nearest- |
| | | 29 | | /// </list> |
| | | 30 | | /// <para> |
| | | 31 | | /// When no <see cref="AgentRunDiagnosticsContext"/> is present in the |
| | | 32 | | /// <c>additionalContext</c> collection, the evaluator returns an empty |
| | | 33 | | /// <see cref="EvaluationResult"/> — callers should treat that as "not applicable". |
| | | 34 | | /// </para> |
| | | 35 | | /// </remarks> |
| | | 36 | | public sealed class ToolCallTrajectoryEvaluator : IEvaluator |
| | | 37 | | { |
| | | 38 | | /// <summary>Metric name for the total tool-call count.</summary> |
| | | 39 | | public const string TotalMetricName = "Tool Calls Total"; |
| | | 40 | | |
| | | 41 | | /// <summary>Metric name for the failed tool-call count.</summary> |
| | | 42 | | public const string FailedMetricName = "Tool Calls Failed"; |
| | | 43 | | |
| | | 44 | | /// <summary>Metric name for the number of gaps in the recorded tool-call sequence.</summary> |
| | | 45 | | public const string SequenceGapsMetricName = "Tool Call Sequence Gaps"; |
| | | 46 | | |
| | | 47 | | /// <summary>Metric name for the boolean rollup indicating every tool call succeeded.</summary> |
| | | 48 | | public const string AllSucceededMetricName = "All Tool Calls Succeeded"; |
| | | 49 | | |
| | | 50 | | /// <summary>Metric name for the count of consecutive tool calls with the same tool name.</summary> |
| | | 51 | | public const string ConsecutiveSameToolMetricName = "Consecutive Same-Tool Calls"; |
| | | 52 | | |
| | | 53 | | /// <summary>Metric name for the JSON-formatted per-tool failure rate breakdown.</summary> |
| | | 54 | | public const string PerToolFailureRateMetricName = "Per-Tool Failure Rate"; |
| | | 55 | | |
| | | 56 | | /// <summary>Metric name for the 50th percentile tool-call latency in milliseconds.</summary> |
| | | 57 | | public const string LatencyP50MetricName = "Tool Call Latency P50"; |
| | | 58 | | |
| | | 59 | | /// <summary>Metric name for the 95th percentile tool-call latency in milliseconds.</summary> |
| | | 60 | | public const string LatencyP95MetricName = "Tool Call Latency P95"; |
| | | 61 | | |
| | | 62 | | /// <inheritdoc /> |
| | 0 | 63 | | public IReadOnlyCollection<string> EvaluationMetricNames { get; } = |
| | 16 | 64 | | [ |
| | 16 | 65 | | TotalMetricName, |
| | 16 | 66 | | FailedMetricName, |
| | 16 | 67 | | SequenceGapsMetricName, |
| | 16 | 68 | | AllSucceededMetricName, |
| | 16 | 69 | | ConsecutiveSameToolMetricName, |
| | 16 | 70 | | PerToolFailureRateMetricName, |
| | 16 | 71 | | LatencyP50MetricName, |
| | 16 | 72 | | LatencyP95MetricName, |
| | 16 | 73 | | ]; |
| | | 74 | | |
| | | 75 | | /// <inheritdoc /> |
| | | 76 | | public ValueTask<EvaluationResult> EvaluateAsync( |
| | | 77 | | IEnumerable<ChatMessage> messages, |
| | | 78 | | ChatResponse modelResponse, |
| | | 79 | | ChatConfiguration? chatConfiguration = null, |
| | | 80 | | IEnumerable<EvaluationContext>? additionalContext = null, |
| | | 81 | | CancellationToken cancellationToken = default) |
| | | 82 | | { |
| | 16 | 83 | | var diagnostics = additionalContext? |
| | 16 | 84 | | .OfType<AgentRunDiagnosticsContext>() |
| | 16 | 85 | | .FirstOrDefault()? |
| | 16 | 86 | | .Diagnostics; |
| | | 87 | | |
| | 16 | 88 | | if (diagnostics is null) |
| | | 89 | | { |
| | 1 | 90 | | return new ValueTask<EvaluationResult>(new EvaluationResult()); |
| | | 91 | | } |
| | | 92 | | |
| | 15 | 93 | | var toolCalls = diagnostics.ToolCalls; |
| | 15 | 94 | | var total = toolCalls.Count; |
| | 15 | 95 | | var failed = 0; |
| | 96 | 96 | | for (var i = 0; i < toolCalls.Count; i++) |
| | | 97 | | { |
| | 33 | 98 | | if (!toolCalls[i].Succeeded) |
| | | 99 | | { |
| | 3 | 100 | | failed++; |
| | | 101 | | } |
| | | 102 | | } |
| | | 103 | | |
| | 15 | 104 | | var gaps = CountSequenceGaps(toolCalls); |
| | 15 | 105 | | var allSucceeded = failed == 0; |
| | 15 | 106 | | var consecutiveSameTool = CountConsecutiveSameTool(toolCalls); |
| | 15 | 107 | | var perToolFailureRate = BuildPerToolFailureRate(toolCalls); |
| | 15 | 108 | | var (p50, p95) = ComputeLatencyPercentiles(toolCalls); |
| | | 109 | | |
| | 15 | 110 | | var totalMetric = new NumericMetric( |
| | 15 | 111 | | TotalMetricName, |
| | 15 | 112 | | value: total, |
| | 15 | 113 | | reason: total == 0 |
| | 15 | 114 | | ? "No tool calls were recorded for this agent run." |
| | 15 | 115 | | : $"{total} tool call(s) were recorded."); |
| | | 116 | | |
| | 15 | 117 | | var failedMetric = new NumericMetric( |
| | 15 | 118 | | FailedMetricName, |
| | 15 | 119 | | value: failed, |
| | 15 | 120 | | reason: failed == 0 |
| | 15 | 121 | | ? "All recorded tool calls succeeded." |
| | 15 | 122 | | : $"{failed} of {total} recorded tool call(s) failed."); |
| | | 123 | | |
| | 15 | 124 | | var gapsMetric = new NumericMetric( |
| | 15 | 125 | | SequenceGapsMetricName, |
| | 15 | 126 | | value: gaps, |
| | 15 | 127 | | reason: gaps == 0 |
| | 15 | 128 | | ? "The tool-call sequence is contiguous starting at 0." |
| | 15 | 129 | | : $"{gaps} gap(s) detected in the tool-call sequence."); |
| | | 130 | | |
| | 15 | 131 | | var allSucceededMetric = new BooleanMetric( |
| | 15 | 132 | | AllSucceededMetricName, |
| | 15 | 133 | | value: allSucceeded, |
| | 15 | 134 | | reason: allSucceeded |
| | 15 | 135 | | ? "Every recorded tool call reported success." |
| | 15 | 136 | | : "At least one recorded tool call reported failure."); |
| | | 137 | | |
| | 15 | 138 | | var consecutiveMetric = new NumericMetric( |
| | 15 | 139 | | ConsecutiveSameToolMetricName, |
| | 15 | 140 | | value: consecutiveSameTool, |
| | 15 | 141 | | reason: consecutiveSameTool == 0 |
| | 15 | 142 | | ? "No consecutive same-tool calls detected." |
| | 15 | 143 | | : $"{consecutiveSameTool} consecutive same-tool call(s) detected (heuristic — may include valid parallel |
| | | 144 | | |
| | 15 | 145 | | var failureRateMetric = new StringMetric( |
| | 15 | 146 | | PerToolFailureRateMetricName, |
| | 15 | 147 | | value: perToolFailureRate, |
| | 15 | 148 | | reason: total == 0 |
| | 15 | 149 | | ? "No tool calls to compute failure rates." |
| | 15 | 150 | | : "Per-tool failure rates as JSON (tool name → failure rate 0.0–1.0)."); |
| | | 151 | | |
| | 15 | 152 | | var p50Metric = new NumericMetric( |
| | 15 | 153 | | LatencyP50MetricName, |
| | 15 | 154 | | value: p50, |
| | 15 | 155 | | reason: total == 0 |
| | 15 | 156 | | ? "No tool calls to compute latency." |
| | 15 | 157 | | : $"50th percentile tool-call latency: {p50:F1}ms."); |
| | | 158 | | |
| | 15 | 159 | | var p95Metric = new NumericMetric( |
| | 15 | 160 | | LatencyP95MetricName, |
| | 15 | 161 | | value: p95, |
| | 15 | 162 | | reason: total == 0 |
| | 15 | 163 | | ? "No tool calls to compute latency." |
| | 15 | 164 | | : $"95th percentile tool-call latency: {p95:F1}ms."); |
| | | 165 | | |
| | 15 | 166 | | return new ValueTask<EvaluationResult>(new EvaluationResult( |
| | 15 | 167 | | totalMetric, |
| | 15 | 168 | | failedMetric, |
| | 15 | 169 | | gapsMetric, |
| | 15 | 170 | | allSucceededMetric, |
| | 15 | 171 | | consecutiveMetric, |
| | 15 | 172 | | failureRateMetric, |
| | 15 | 173 | | p50Metric, |
| | 15 | 174 | | p95Metric)); |
| | | 175 | | } |
| | | 176 | | |
/// <summary>
/// Counts how many sequence numbers are missing between the recorded
/// <c>Sequence</c> values once they are sorted ascending.
/// </summary>
/// <param name="toolCalls">The recorded tool calls, in any order.</param>
/// <returns>
/// The total number of skipped sequence values; <c>0</c> when the list is empty.
/// The count is relative to the lowest recorded sequence, and repeated sequence
/// values contribute nothing.
/// </returns>
private static int CountSequenceGaps(IReadOnlyList<ToolCallDiagnostics> toolCalls)
{
    if (toolCalls.Count == 0)
    {
        return 0;
    }

    // Work on a sorted copy so the order of the input list does not matter.
    int[] ordered = toolCalls.Select(call => (int)call.Sequence).ToArray();
    Array.Sort(ordered);

    var missing = 0;
    var nextExpected = ordered[0];
    foreach (var current in ordered)
    {
        // Anything above the next expected value means slots were skipped.
        if (current > nextExpected)
        {
            missing += current - nextExpected;
        }

        nextExpected = current + 1;
    }

    return missing;
}
| | | 205 | | |
/// <summary>
/// Counts adjacent pairs of tool calls that share the same <c>ToolName</c>
/// (ordinal comparison).
/// </summary>
/// <param name="toolCalls">The recorded tool calls, in recorded order.</param>
/// <returns>
/// The number of positions whose tool name equals that of the preceding call;
/// <c>0</c> when fewer than two calls were recorded.
/// </returns>
private static int CountConsecutiveSameTool(IReadOnlyList<ToolCallDiagnostics> toolCalls)
{
    // Pair every call with its successor; an empty or single-element list yields no pairs.
    return toolCalls
        .Zip(
            toolCalls.Skip(1),
            (previous, current) => string.Equals(previous.ToolName, current.ToolName, StringComparison.Ordinal))
        .Count(sameTool => sameTool);
}
| | | 224 | | |
/// <summary>
/// Builds a JSON object mapping each tool name to the fraction of its calls
/// that did not succeed.
/// </summary>
/// <param name="toolCalls">The recorded tool calls.</param>
/// <returns>
/// <c>"{}"</c> when no calls were recorded; otherwise the serialized map with
/// keys in ordinal order and values equal to failures divided by total calls.
/// </returns>
private static string BuildPerToolFailureRate(IReadOnlyList<ToolCallDiagnostics> toolCalls)
{
    if (toolCalls.Count == 0)
    {
        return "{}";
    }

    // One pass: track total calls and failed calls per tool name.
    var tallies = new Dictionary<string, (int Calls, int Failures)>(StringComparer.Ordinal);
    foreach (var call in toolCalls)
    {
        tallies.TryGetValue(call.ToolName, out var tally);
        tallies[call.ToolName] = (tally.Calls + 1, tally.Failures + (call.Succeeded ? 0 : 1));
    }

    // A sorted map keeps the serialized key order stable.
    var failureRates = new SortedDictionary<string, double>(StringComparer.Ordinal);
    foreach (var (toolName, tally) in tallies)
    {
        failureRates[toolName] = (double)tally.Failures / tally.Calls;
    }

    return JsonSerializer.Serialize(failureRates);
}
| | | 257 | | |
/// <summary>
/// Computes the 50th and 95th percentile of the tool-call durations, in milliseconds.
/// </summary>
/// <param name="toolCalls">The recorded tool calls.</param>
/// <returns>
/// The P50 and P95 values picked by <c>NearestRankPercentile</c> from the sorted
/// durations, or <c>(0, 0)</c> when no calls were recorded.
/// </returns>
private static (double P50, double P95) ComputeLatencyPercentiles(
    IReadOnlyList<ToolCallDiagnostics> toolCalls)
{
    if (toolCalls.Count == 0)
    {
        return (0, 0);
    }

    // Percentile selection needs the durations in ascending order.
    double[] sortedMilliseconds = toolCalls
        .Select(call => call.Duration.TotalMilliseconds)
        .ToArray();
    Array.Sort(sortedMilliseconds);

    var median = NearestRankPercentile(sortedMilliseconds, 50);
    var tail = NearestRankPercentile(sortedMilliseconds, 95);
    return (median, tail);
}
| | | 275 | | |
/// <summary>
/// Picks the element of an ascending-sorted array at rank
/// <c>ceil(percentile / 100 * length)</c> (1-based).
/// </summary>
/// <param name="sorted">Values sorted in ascending order.</param>
/// <param name="percentile">The percentile to select, e.g. 50 or 95.</param>
/// <returns>The element at the computed rank, with the index clamped into the array bounds.</returns>
private static double NearestRankPercentile(double[] sorted, int percentile)
{
    var rank = (int)Math.Ceiling(percentile / 100.0 * sorted.Length);
    var lastIndex = sorted.Length - 1;

    // Rank is 1-based; clamping keeps a percentile of 0 on the first element.
    var position = Math.Clamp(rank - 1, 0, lastIndex);
    return sorted[position];
}
| | | 281 | | } |