| | | 1 | | using Microsoft.Extensions.AI; |
| | | 2 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 3 | | |
| | | 4 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 5 | | |
| | | 6 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 7 | | |
| | | 8 | | /// <summary> |
| | | 9 | | /// Deterministic evaluator that scores whether an agent run terminated appropriately, |
| | | 10 | | /// using the captured <see cref="IAgentRunDiagnostics"/> snapshot carried in an |
| | | 11 | | /// <see cref="AgentRunDiagnosticsContext"/>. |
| | | 12 | | /// </summary> |
| | | 13 | | /// <remarks> |
| | | 14 | | /// <para> |
| | | 15 | | /// When the <see cref="AgentRunDiagnosticsContext"/> is present, the evaluator emits: |
| | | 16 | | /// </para> |
| | | 17 | | /// <list type="bullet"> |
| | | 18 | | /// <item><description><c>Run Succeeded</c> — boolean; mirrors <see cref="IAgentRunDiagnostics.Succeeded"/>.</description></item> |
| | | 19 | | /// <item><description><c>Termination Consistent</c> — boolean; <see langword="true"/> when <c>Succeeded</c> is consistent with the presence or absence of an error message.</description></item> |
| | | 20 | | /// <item><description><c>Execution Mode</c> — string; mirrors <see cref="IAgentRunDiagnostics.ExecutionMode"/>, or <c>Unknown</c> when no execution mode was captured.</description></item> |
| | | 21 | | /// </list> |
| | | 22 | | /// <para> |
| | | 23 | | /// When no <see cref="AgentRunDiagnosticsContext"/> is present, the evaluator returns |
| | | 24 | | /// an empty <see cref="EvaluationResult"/>. |
| | | 25 | | /// </para> |
| | | 26 | | /// </remarks> |
public sealed class TerminationAppropriatenessEvaluator : IEvaluator
{
    /// <summary>Metric name for the success rollup.</summary>
    public const string RunSucceededMetricName = "Run Succeeded";

    /// <summary>Metric name for the success/error consistency check.</summary>
    public const string TerminationConsistentMetricName = "Termination Consistent";

    /// <summary>Metric name for the captured execution mode string.</summary>
    public const string ExecutionModeMetricName = "Execution Mode";

    /// <summary>Execution mode string emitted when the diagnostics do not carry one.</summary>
    public const string UnknownExecutionMode = "Unknown";

    /// <summary>
    /// Names of the metrics this evaluator emits when diagnostics are available:
    /// <see cref="RunSucceededMetricName"/>, <see cref="TerminationConsistentMetricName"/>
    /// and <see cref="ExecutionModeMetricName"/>.
    /// </summary>
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } =
    [
        RunSucceededMetricName,
        TerminationConsistentMetricName,
        ExecutionModeMetricName,
    ];

    /// <summary>
    /// Builds the termination metrics from the first <see cref="AgentRunDiagnosticsContext"/>
    /// found in <paramref name="additionalContext"/>.
    /// </summary>
    /// <param name="messages">Not read by this evaluator.</param>
    /// <param name="modelResponse">Not read by this evaluator.</param>
    /// <param name="chatConfiguration">Not read by this evaluator.</param>
    /// <param name="additionalContext">
    /// Context objects to search for an <see cref="AgentRunDiagnosticsContext"/>; may be
    /// <see langword="null"/>.
    /// </param>
    /// <param name="cancellationToken">Not read by this evaluator; the work is synchronous.</param>
    /// <returns>
    /// An already-completed task. The result is empty when no diagnostics context (or no
    /// diagnostics snapshot) is supplied; otherwise it carries the three metrics listed in
    /// <see cref="EvaluationMetricNames"/>.
    /// </returns>
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        var diagnostics = additionalContext?
            .OfType<AgentRunDiagnosticsContext>()
            .FirstOrDefault()?
            .Diagnostics;

        if (diagnostics is null)
        {
            return new ValueTask<EvaluationResult>(new EvaluationResult());
        }

        var runSucceeded = diagnostics.Succeeded;
        var errorMessage = diagnostics.ErrorMessage;
        var hasErrorMessage = !string.IsNullOrEmpty(errorMessage);

        // Consistent means exactly one of "succeeded" / "has an error message" holds.
        var terminationConsistent = runSucceeded != hasErrorMessage;

        // Use the same null-or-empty test as the consistency check so an empty message
        // falls back to the placeholder instead of rendering as "failed: .".
        var failureDetail = hasErrorMessage ? errorMessage : "no error message captured";

        var runSucceededMetric = new BooleanMetric(
            RunSucceededMetricName,
            value: runSucceeded,
            reason: runSucceeded
                ? "The agent run reported success."
                : $"The agent run failed: {failureDetail}.");

        var terminationConsistentMetric = new BooleanMetric(
            TerminationConsistentMetricName,
            value: terminationConsistent,
            reason: terminationConsistent
                ? "Success flag is consistent with the presence/absence of an error message."
                : runSucceeded
                    ? "The run reported success but an error message was also captured."
                    : "The run reported failure but no error message was captured.");

        var capturedMode = diagnostics.ExecutionMode;
        var executionMode = string.IsNullOrEmpty(capturedMode)
            ? UnknownExecutionMode
            : capturedMode;

        var executionModeMetric = new StringMetric(
            ExecutionModeMetricName,
            value: executionMode,
            reason: $"The captured execution mode was '{executionMode}'.");

        return new ValueTask<EvaluationResult>(new EvaluationResult(
            runSucceededMetric,
            terminationConsistentMetric,
            executionModeMetric));
    }
}