| | | 1 | | using Microsoft.Extensions.AI; |
| | | 2 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 3 | | |
| | | 4 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 5 | | |
| | | 6 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 7 | | |
/// <summary>
/// Model-free evaluator that turns the <see cref="IPipelineRunResult"/> carried by a
/// <see cref="PipelineEvaluationContext"/> into a fixed set of pipeline health metrics.
/// </summary>
/// <remarks>
/// <para>
/// No language model is called; every value is read directly from the
/// <see cref="IPipelineRunResult"/>. The metrics produced are:
/// </para>
/// <list type="bullet">
/// <item><description><c>pipeline.succeeded</c> — the run result's success flag.</description></item>
/// <item><description><c>pipeline.total_stages</c> — the planned stage count reported by the run result.</description></item>
/// <item><description><c>pipeline.completed_stages</c> — stages whose diagnostics are non-null.</description></item>
/// <item><description><c>pipeline.skipped_stages</c> — stages whose diagnostics AND final response are both null.</description></item>
/// <item><description><c>pipeline.total_duration_ms</c> — total pipeline duration in milliseconds.</description></item>
/// <item><description><c>pipeline.error_message</c> — the run result's error message (may be null).</description></item>
/// </list>
/// <para>
/// If the <c>additionalContext</c> collection is null, holds no
/// <see cref="PipelineEvaluationContext"/>, or the first such context has a null pipeline
/// result, an empty <see cref="EvaluationResult"/> is returned — callers should read that
/// as "not applicable".
/// </para>
/// </remarks>
public sealed class PipelineStageEvaluator : IEvaluator
{
    /// <summary>Name of the boolean metric reporting whether the pipeline succeeded.</summary>
    public const string SucceededMetricName = "pipeline.succeeded";

    /// <summary>Name of the numeric metric reporting the total number of stages.</summary>
    public const string TotalStagesMetricName = "pipeline.total_stages";

    /// <summary>Name of the numeric metric counting stages that have diagnostics.</summary>
    public const string CompletedStagesMetricName = "pipeline.completed_stages";

    /// <summary>Name of the numeric metric counting stages with neither diagnostics nor a response.</summary>
    public const string SkippedStagesMetricName = "pipeline.skipped_stages";

    /// <summary>Name of the numeric metric reporting the total pipeline duration in milliseconds.</summary>
    public const string TotalDurationMsMetricName = "pipeline.total_duration_ms";

    /// <summary>Name of the string metric carrying the pipeline's error message, if any.</summary>
    public const string ErrorMessageMetricName = "pipeline.error_message";

    /// <inheritdoc />
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } =
    [
        SucceededMetricName,
        TotalStagesMetricName,
        CompletedStagesMetricName,
        SkippedStagesMetricName,
        TotalDurationMsMetricName,
        ErrorMessageMetricName,
    ];

    /// <inheritdoc />
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        // Only the first PipelineEvaluationContext in the collection is consulted.
        var pipelineContext = additionalContext?
            .OfType<PipelineEvaluationContext>()
            .FirstOrDefault();

        if (pipelineContext?.PipelineResult is not { } run)
        {
            // Nothing to score: hand back an empty result rather than failing.
            return new ValueTask<EvaluationResult>(new EvaluationResult());
        }

        var recordedStages = run.Stages;
        var plannedCount = run.PlannedStageCount;

        var withDiagnostics = 0;
        var withoutAnything = 0;

        foreach (var stage in recordedStages)
        {
            if (stage.Diagnostics is null)
            {
                // A stage lacking diagnostics but holding a response lands in neither bucket.
                if (stage.FinalResponse is null)
                {
                    withoutAnything++;
                }
            }
            else
            {
                withDiagnostics++;
            }
        }

        var elapsedMs = run.TotalDuration.TotalMilliseconds;
        var pipelineSucceeded = run.Succeeded;
        var failureText = run.ErrorMessage;

        string succeededReason;
        if (pipelineSucceeded)
        {
            succeededReason = "Pipeline completed successfully.";
        }
        else
        {
            succeededReason = "Pipeline did not complete successfully.";
        }

        var completedReason = withDiagnostics != plannedCount
            ? $"{withDiagnostics} of {plannedCount} stage(s) completed with diagnostics."
            : "All stages completed with diagnostics.";

        var skippedReason = withoutAnything != 0
            ? $"{withoutAnything} stage(s) were skipped (no diagnostics and no response)."
            : "No stages were skipped.";

        var errorReason = failureText is null
            ? "No error occurred."
            : $"Pipeline error: {failureText}";

        // Metric order mirrors EvaluationMetricNames.
        EvaluationMetric[] scored =
        [
            new BooleanMetric(SucceededMetricName, value: pipelineSucceeded, reason: succeededReason),
            new NumericMetric(TotalStagesMetricName, value: plannedCount, reason: $"Pipeline has {plannedCount} stage(s)."),
            new NumericMetric(CompletedStagesMetricName, value: withDiagnostics, reason: completedReason),
            new NumericMetric(SkippedStagesMetricName, value: withoutAnything, reason: skippedReason),
            new NumericMetric(TotalDurationMsMetricName, value: elapsedMs, reason: $"Pipeline ran for {elapsedMs:F0}ms."),
            new StringMetric(ErrorMessageMetricName, value: failureText, reason: errorReason),
        ];

        return new ValueTask<EvaluationResult>(new EvaluationResult(scored));
    }
}