< Summary

Line coverage
96%
Covered lines: 82
Uncovered lines: 3
Coverable lines: 85
Total lines: 321
Line coverage: 96.4%
Branch coverage
88%
Covered branches: 30
Total branches: 34
Branch coverage: 88.2%
Method coverage

Feature is only available for sponsors

Upgrade to PRO version

Metrics

Method | Branch coverage | Crap Score | Cyclomatic complexity | Line coverage
File 1: ScorePattern() | 100% | 1 | 1 | 100%
File 1: CompletedPattern() | 100% | 1 | 1 | 100%
File 1: ReasoningPattern() | 100% | 1 | 1 | 100%
File 2: get_EvaluationMetricNames() | 100% | 2 | 1 | 0%
File 2: .ctor() | 100% | 1 | 1 | 100%
File 2: EvaluateAsync() | 78.57% | 14 | 14 | 100%
File 2: ExtractUserPrompt(...) | 83.33% | 6 | 6 | 80%
File 2: BuildEvaluationPrompt(...) | 100% | 6 | 6 | 94.44%
File 2: ParseJudgeResponse(...) | 100% | 8 | 8 | 100%

File(s)

/_/src/NexusLabs.Needlr.AgentFramework.Evaluation/obj/Release/net10.0/System.Text.RegularExpressions.Generator/System.Text.RegularExpressions.Generator.RegexGenerator/RegexGenerator.g.cs

File '/_/src/NexusLabs.Needlr.AgentFramework.Evaluation/obj/Release/net10.0/System.Text.RegularExpressions.Generator/System.Text.RegularExpressions.Generator.RegexGenerator/RegexGenerator.g.cs' does not exist (any more).

/home/runner/work/needlr/needlr/src/NexusLabs.Needlr.AgentFramework.Evaluation/TaskCompletionEvaluator.cs

#LineLine coverage
 1using System.Globalization;
 2using System.Text.Json;
 3using System.Text.RegularExpressions;
 4
 5using Microsoft.Extensions.AI;
 6using Microsoft.Extensions.AI.Evaluation;
 7
 8using NexusLabs.Needlr.AgentFramework.Diagnostics;
 9
 10namespace NexusLabs.Needlr.AgentFramework.Evaluation;
 11
 12/// <summary>
 13/// LLM-judged evaluator that assesses whether an agent actually accomplished
 14/// the task it was given. Unlike MEAI's <c>TaskAdherenceEvaluator</c> (which
 15/// checks instruction following), this evaluator checks <em>task success</em>:
 16/// did the agent produce output that satisfies the original request?
 17/// </summary>
 18/// <remarks>
 19/// <para>
 20/// This evaluator requires a <see cref="ChatConfiguration"/> with a judge
 21/// <see cref="IChatClient"/>. It sends the original prompt and agent output
 22/// to the judge with a structured evaluation prompt and parses the response.
 23/// </para>
 24/// <para>
 25/// When no judge is configured (<c>chatConfiguration</c> is null
 26/// or has no <see cref="ChatConfiguration.ChatClient"/>), the evaluator
 27/// returns an empty <see cref="EvaluationResult"/>.
 28/// </para>
 29/// <para>
 30/// Metrics produced:
 31/// </para>
 32/// <list type="bullet">
 33///   <item><description><c>Task Completed</c> — boolean. <see langword="true"/> when the
 34///   judge determines the agent accomplished the requested task.</description></item>
 35///   <item><description><c>Task Completion Score</c> — numeric (1–5). How completely and
 36///   correctly the agent fulfilled the request. 5 = fully complete, 1 = not started or
 37///   completely wrong.</description></item>
 38///   <item><description><c>Task Completion Reasoning</c> — string. The judge's
 39///   explanation for the score.</description></item>
 40/// </list>
 41/// </remarks>
 42public sealed partial class TaskCompletionEvaluator : IEvaluator
 43{
 44    /// <summary>Metric name for the boolean task-completed flag.</summary>
 45    public const string TaskCompletedMetricName = "Task Completed";
 46
 47    /// <summary>Metric name for the numeric 1–5 completion score.</summary>
 48    public const string TaskCompletionScoreMetricName = "Task Completion Score";
 49
 50    /// <summary>Metric name for the judge's reasoning.</summary>
 51    public const string TaskCompletionReasoningMetricName = "Task Completion Reasoning";
 52
 53    /// <summary>Score threshold at or above which the task is considered completed.</summary>
 54    public const int CompletionThreshold = 3;
 55
 56    private const string SystemPrompt = """
 57        You are an evaluation judge. Your job is to assess whether an AI agent
 58        completed a task it was given.
 59
 60        You will be given:
 61        1. The original task/request (USER PROMPT)
 62        2. The agent's final output (AGENT OUTPUT)
 63
 64        Rate the agent's task completion on a scale of 1-5:
 65        - 5: Task fully completed with correct, comprehensive output
 66        - 4: Task mostly completed with minor gaps or issues
 67        - 3: Task partially completed — core intent addressed but significant gaps
 68        - 2: Task barely started — some relevant content but far from complete
 69        - 1: Task not completed — output is wrong, empty, or irrelevant
 70
 71        Respond with EXACTLY this format (no markdown, no extra text):
 72        SCORE: <number 1-5>
 73        COMPLETED: <YES or NO>
 74        REASONING: <one paragraph explaining your assessment>
 75        """;
 76
 77    /// <inheritdoc />
 078    public IReadOnlyCollection<string> EvaluationMetricNames { get; } =
 579    [
 580        TaskCompletedMetricName,
 581        TaskCompletionScoreMetricName,
 582        TaskCompletionReasoningMetricName,
 583    ];
 84
 85    /// <inheritdoc />
 86    public async ValueTask<EvaluationResult> EvaluateAsync(
 87        IEnumerable<ChatMessage> messages,
 88        ChatResponse modelResponse,
 89        ChatConfiguration? chatConfiguration = null,
 90        IEnumerable<EvaluationContext>? additionalContext = null,
 91        CancellationToken cancellationToken = default)
 92    {
 593        if (chatConfiguration?.ChatClient is null)
 94        {
 195            return new EvaluationResult();
 96        }
 97
 498        var userPrompt = ExtractUserPrompt(messages);
 499        var agentOutput = modelResponse.Text ?? string.Empty;
 100
 4101        var diagnosticsContext = additionalContext?
 4102            .OfType<AgentRunDiagnosticsContext>()
 4103            .FirstOrDefault()?
 4104            .Diagnostics;
 105
 4106        var evaluationPrompt = BuildEvaluationPrompt(userPrompt, agentOutput, diagnosticsContext);
 107
 4108        var judgeMessages = new ChatMessage[]
 4109        {
 4110            new(ChatRole.System, SystemPrompt),
 4111            new(ChatRole.User, evaluationPrompt),
 4112        };
 113
 4114        var judgeResponse = await chatConfiguration.ChatClient
 4115            .GetResponseAsync(judgeMessages, cancellationToken: cancellationToken)
 4116            .ConfigureAwait(false);
 117
 4118        var judgeText = judgeResponse.Text ?? string.Empty;
 4119        var (score, completed, reasoning) = ParseJudgeResponse(judgeText);
 120
 4121        return new EvaluationResult(
 4122            new BooleanMetric(
 4123                TaskCompletedMetricName,
 4124                value: completed,
 4125                reason: completed
 4126                    ? "The judge determined the agent accomplished the task."
 4127                    : "The judge determined the agent did not accomplish the task."),
 4128            new NumericMetric(
 4129                TaskCompletionScoreMetricName,
 4130                value: score,
 4131                reason: $"Score {score}/5 (threshold for completion: {CompletionThreshold})."),
 4132            new StringMetric(
 4133                TaskCompletionReasoningMetricName,
 4134                value: reasoning,
 4135                reason: "The judge's explanation for the task completion assessment."));
 5136    }
 137
 138    private static string ExtractUserPrompt(IEnumerable<ChatMessage> messages)
 139    {
 12140        foreach (var msg in messages)
 141        {
 4142            if (msg.Role == ChatRole.User && !string.IsNullOrWhiteSpace(msg.Text))
 143            {
 4144                return msg.Text;
 145            }
 146        }
 147
 0148        return "(no user prompt provided)";
 4149    }
 150
 151    private static string BuildEvaluationPrompt(
 152        string userPrompt,
 153        string agentOutput,
 154        IAgentRunDiagnostics? diagnostics)
 155    {
 4156        var prompt = $"""
 4157            USER PROMPT:
 4158            {userPrompt}
 4159
 4160            AGENT OUTPUT:
 4161            {(string.IsNullOrWhiteSpace(agentOutput) ? "(empty — the agent produced no text output)" : agentOutput)}
 4162            """;
 163
 4164        if (diagnostics is not null)
 165        {
 1166            prompt += $"""
 1167
 1168
 1169                ADDITIONAL CONTEXT:
 1170                - Tool calls made: {diagnostics.ToolCalls.Count}
 0171                - Tool calls failed: {diagnostics.ToolCalls.Count(t => !t.Succeeded)}
 1172                - Execution mode: {diagnostics.ExecutionMode ?? "unknown"}
 1173                - Agent reported success: {diagnostics.Succeeded}
 1174                """;
 175        }
 176
 4177        return prompt;
 178    }
 179
 180    internal static (int Score, bool Completed, string Reasoning) ParseJudgeResponse(string response)
 181    {
 12182        var score = 1;
 12183        var completed = false;
 12184        var reasoning = "Unable to parse judge response.";
 185
 12186        var scoreMatch = ScorePattern().Match(response);
 12187        if (scoreMatch.Success &&
 12188            int.TryParse(scoreMatch.Groups[1].Value, CultureInfo.InvariantCulture, out var parsedScore))
 189        {
 11190            score = Math.Clamp(parsedScore, 1, 5);
 191        }
 192
 12193        var completedMatch = CompletedPattern().Match(response);
 12194        if (completedMatch.Success)
 195        {
 9196            completed = string.Equals(
 9197                completedMatch.Groups[1].Value.Trim(),
 9198                "YES",
 9199                StringComparison.OrdinalIgnoreCase);
 200        }
 201        else
 202        {
 3203            completed = score >= CompletionThreshold;
 204        }
 205
 12206        var reasoningMatch = ReasoningPattern().Match(response);
 12207        if (reasoningMatch.Success)
 208        {
 11209            reasoning = reasoningMatch.Groups[1].Value.Trim();
 210        }
 211
 12212        return (score, completed, reasoning);
 213    }
 214
 215    [GeneratedRegex(@"SCORE:\s*(\d)", RegexOptions.IgnoreCase)]
 216    private static partial Regex ScorePattern();
 217
 218    [GeneratedRegex(@"COMPLETED:\s*(YES|NO)", RegexOptions.IgnoreCase)]
 219    private static partial Regex CompletedPattern();
 220
 221    [GeneratedRegex(@"REASONING:\s*(.+)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
 222    private static partial Regex ReasoningPattern();
 223}