| | | 1 | | using System.Globalization; |
| | | 2 | | using System.Text.Json; |
| | | 3 | | using System.Text.RegularExpressions; |
| | | 4 | | |
| | | 5 | | using Microsoft.Extensions.AI; |
| | | 6 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 7 | | |
| | | 8 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 9 | | |
| | | 10 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 11 | | |
/// <summary>
/// LLM-judged evaluator that assesses whether an agent actually accomplished
/// the task it was given. Unlike MEAI's <c>TaskAdherenceEvaluator</c> (which
/// checks instruction following), this evaluator checks <em>task success</em>:
/// did the agent produce output that satisfies the original request?
/// </summary>
/// <remarks>
/// <para>
/// This evaluator requires a <see cref="ChatConfiguration"/> with a judge
/// <see cref="IChatClient"/>. It sends the original prompt and agent output
/// to the judge with a structured evaluation prompt and parses the response.
/// </para>
/// <para>
/// When no judge is configured (<c>chatConfiguration</c> is null
/// or has no <see cref="ChatConfiguration.ChatClient"/>), the evaluator
/// returns an empty <see cref="EvaluationResult"/>.
/// </para>
/// <para>
/// Metrics produced:
/// </para>
/// <list type="bullet">
/// <item><description><c>Task Completed</c> — boolean. <see langword="true"/> when the
/// judge determines the agent accomplished the requested task.</description></item>
/// <item><description><c>Task Completion Score</c> — numeric (1–5). How completely and
/// correctly the agent fulfilled the request. 5 = fully complete, 1 = not started or
/// completely wrong.</description></item>
/// <item><description><c>Task Completion Reasoning</c> — string. The judge's
/// explanation for the score.</description></item>
/// </list>
/// </remarks>
public sealed partial class TaskCompletionEvaluator : IEvaluator
{
    /// <summary>Metric name for the boolean task-completed flag.</summary>
    public const string TaskCompletedMetricName = "Task Completed";

    /// <summary>Metric name for the numeric 1–5 completion score.</summary>
    public const string TaskCompletionScoreMetricName = "Task Completion Score";

    /// <summary>Metric name for the judge's reasoning.</summary>
    public const string TaskCompletionReasoningMetricName = "Task Completion Reasoning";

    /// <summary>Score threshold at or above which the task is considered completed.</summary>
    public const int CompletionThreshold = 3;

    // Instructions sent to the judge as the system message. The response format
    // requested here is what ParseJudgeResponse looks for.
    private const string SystemPrompt = """
        You are an evaluation judge. Your job is to assess whether an AI agent
        completed a task it was given.

        You will be given:
        1. The original task/request (USER PROMPT)
        2. The agent's final output (AGENT OUTPUT)

        Rate the agent's task completion on a scale of 1-5:
        - 5: Task fully completed with correct, comprehensive output
        - 4: Task mostly completed with minor gaps or issues
        - 3: Task partially completed — core intent addressed but significant gaps
        - 2: Task barely started — some relevant content but far from complete
        - 1: Task not completed — output is wrong, empty, or irrelevant

        Respond with EXACTLY this format (no markdown, no extra text):
        SCORE: <number 1-5>
        COMPLETED: <YES or NO>
        REASONING: <one paragraph explaining your assessment>
        """;

    /// <inheritdoc />
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } =
    [
        TaskCompletedMetricName,
        TaskCompletionScoreMetricName,
        TaskCompletionReasoningMetricName,
    ];

    /// <inheritdoc />
    public async ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        // Without a judge client there is nothing to evaluate with; return no metrics.
        if (chatConfiguration?.ChatClient is null)
        {
            return new EvaluationResult();
        }

        var userPrompt = ExtractUserPrompt(messages);
        var agentOutput = modelResponse.Text ?? string.Empty;

        // Optional run diagnostics supplied by the caller; the first matching context wins.
        var diagnosticsContext = additionalContext?
            .OfType<AgentRunDiagnosticsContext>()
            .FirstOrDefault()?
            .Diagnostics;

        var evaluationPrompt = BuildEvaluationPrompt(userPrompt, agentOutput, diagnosticsContext);

        var judgeMessages = new ChatMessage[]
        {
            new(ChatRole.System, SystemPrompt),
            new(ChatRole.User, evaluationPrompt),
        };

        var judgeResponse = await chatConfiguration.ChatClient
            .GetResponseAsync(judgeMessages, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        var judgeText = judgeResponse.Text ?? string.Empty;
        var (score, completed, reasoning) = ParseJudgeResponse(judgeText);

        return new EvaluationResult(
            new BooleanMetric(
                TaskCompletedMetricName,
                value: completed,
                reason: completed
                    ? "The judge determined the agent accomplished the task."
                    : "The judge determined the agent did not accomplish the task."),
            new NumericMetric(
                TaskCompletionScoreMetricName,
                value: score,
                reason: $"Score {score}/5 (threshold for completion: {CompletionThreshold})."),
            new StringMetric(
                TaskCompletionReasoningMetricName,
                value: reasoning,
                reason: "The judge's explanation for the task completion assessment."));
    }

    /// <summary>
    /// Returns the text of the first user-role message that has non-whitespace text,
    /// or a placeholder string when no such message exists.
    /// </summary>
    /// <param name="messages">The conversation messages to scan, in order.</param>
    /// <returns>The user prompt text, or <c>"(no user prompt provided)"</c>.</returns>
    private static string ExtractUserPrompt(IEnumerable<ChatMessage> messages)
    {
        foreach (var msg in messages)
        {
            if (msg.Role == ChatRole.User && !string.IsNullOrWhiteSpace(msg.Text))
            {
                return msg.Text;
            }
        }

        return "(no user prompt provided)";
    }

    /// <summary>
    /// Builds the user message sent to the judge: the original prompt, the agent
    /// output (or a placeholder when it is blank), and — when diagnostics are
    /// available — a short summary of tool calls, execution mode, and reported success.
    /// </summary>
    /// <param name="userPrompt">The original task text.</param>
    /// <param name="agentOutput">The agent's final text output; may be empty.</param>
    /// <param name="diagnostics">Optional run diagnostics; omitted from the prompt when null.</param>
    /// <returns>The evaluation prompt text.</returns>
    private static string BuildEvaluationPrompt(
        string userPrompt,
        string agentOutput,
        IAgentRunDiagnostics? diagnostics)
    {
        var prompt = $"""
            USER PROMPT:
            {userPrompt}

            AGENT OUTPUT:
            {(string.IsNullOrWhiteSpace(agentOutput) ? "(empty — the agent produced no text output)" : agentOutput)}
            """;

        if (diagnostics is not null)
        {
            // Computed up front to keep the interpolation holes simple.
            var failedToolCalls = diagnostics.ToolCalls.Count(t => !t.Succeeded);

            prompt += $"""


                ADDITIONAL CONTEXT:
                - Tool calls made: {diagnostics.ToolCalls.Count}
                - Tool calls failed: {failedToolCalls}
                - Execution mode: {diagnostics.ExecutionMode ?? "unknown"}
                - Agent reported success: {diagnostics.Succeeded}
                """;
        }

        return prompt;
    }

    /// <summary>
    /// Parses the judge's free-text reply into a score, a completed flag, and reasoning.
    /// </summary>
    /// <remarks>
    /// <para>
    /// Each field is matched independently and case-insensitively, so a partially
    /// well-formed reply still yields whatever fields could be read.
    /// </para>
    /// <list type="bullet">
    /// <item><description>Score: the integer after <c>SCORE:</c>, clamped to 1–5.
    /// Defaults to 1 when absent or not parseable as an <see cref="int"/>.</description></item>
    /// <item><description>Completed: <see langword="true"/> when <c>COMPLETED:</c> is
    /// followed by <c>YES</c>. When no <c>COMPLETED:</c> line is found, falls back to
    /// <c>score &gt;= <see cref="CompletionThreshold"/></c>.</description></item>
    /// <item><description>Reasoning: everything after <c>REASONING:</c> to the end of the
    /// reply, trimmed. Defaults to <c>"Unable to parse judge response."</c> when absent.</description></item>
    /// </list>
    /// </remarks>
    /// <param name="response">The raw text returned by the judge.</param>
    /// <returns>The parsed score, completed flag, and reasoning.</returns>
    internal static (int Score, bool Completed, string Reasoning) ParseJudgeResponse(string response)
    {
        var score = 1;
        var completed = false;
        var reasoning = "Unable to parse judge response.";

        var scoreMatch = ScorePattern().Match(response);
        if (scoreMatch.Success &&
            int.TryParse(scoreMatch.Groups[1].Value, CultureInfo.InvariantCulture, out var parsedScore))
        {
            // The judge may ignore the requested range (e.g. "10" or "0"); clamp into 1–5.
            score = Math.Clamp(parsedScore, 1, 5);
        }

        var completedMatch = CompletedPattern().Match(response);
        if (completedMatch.Success)
        {
            completed = string.Equals(
                completedMatch.Groups[1].Value.Trim(),
                "YES",
                StringComparison.OrdinalIgnoreCase);
        }
        else
        {
            // No explicit verdict from the judge: derive it from the score.
            completed = score >= CompletionThreshold;
        }

        var reasoningMatch = ReasoningPattern().Match(response);
        if (reasoningMatch.Success)
        {
            reasoning = reasoningMatch.Groups[1].Value.Trim();
        }

        return (score, completed, reasoning);
    }

    // Captures the whole run of digits rather than a single digit, so an out-of-range
    // reply such as "SCORE: 10" is clamped to 5 instead of being misread as 1.
    [GeneratedRegex(@"SCORE:\s*(\d+)", RegexOptions.IgnoreCase)]
    private static partial Regex ScorePattern();

    [GeneratedRegex(@"COMPLETED:\s*(YES|NO)", RegexOptions.IgnoreCase)]
    private static partial Regex CompletedPattern();

    // Singleline lets '.' match newlines, so multi-line reasoning is captured to the end.
    [GeneratedRegex(@"REASONING:\s*(.+)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
    private static partial Regex ReasoningPattern();
}