| | | 1 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 2 | | |
| | | 3 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 4 | | |
/// <summary>
/// Configurable quality gate that asserts evaluation metrics meet defined
/// thresholds. Designed for CI pipelines — call <see cref="Assert"/> after
/// running evaluators to fail the build when metrics regress.
/// </summary>
/// <remarks>
/// <para>
/// Thresholds are defined fluently via <see cref="RequireNumericMax"/>,
/// <see cref="RequireNumericMin"/>, and <see cref="RequireBoolean"/>. Each
/// threshold names a metric (using the <c>*MetricName</c> constants from
/// evaluator classes) and a bound. <see cref="Assert"/> checks all thresholds
/// against the evaluation results and throws
/// <see cref="QualityGateFailedException"/> listing every violation.
/// </para>
/// <para>
/// Metrics not present in any <see cref="EvaluationResult"/> are silently
/// skipped — this allows a gate to be used with evaluators that conditionally
/// emit metrics (e.g., <see cref="IterationCoherenceEvaluator"/> only emits
/// when execution mode is <c>IterativeLoop</c>). A metric that is present but
/// carries no value, or whose type does not match the threshold (for example a
/// <see cref="BooleanMetric"/> checked by a numeric threshold), is skipped as
/// well.
/// </para>
/// <para>
/// A <see cref="NumericMetric"/> whose value is <see cref="double.NaN"/> is
/// reported as a violation: NaN cannot be ordered against a bound, so it can
/// never be shown to satisfy one.
/// </para>
/// </remarks>
/// <example>
/// <code>
/// var gate = new EvaluationQualityGate()
///     .RequireBoolean(ToolCallTrajectoryEvaluator.AllSucceededMetricName, expected: true)
///     .RequireBoolean(IterationCoherenceEvaluator.TerminatedCoherentlyMetricName, expected: true)
///     .RequireNumericMax(EfficiencyEvaluator.TotalTokensMetricName, max: 50_000)
///     .RequireBoolean(EfficiencyEvaluator.UnderBudgetMetricName, expected: true);
///
/// // Throws QualityGateFailedException if any threshold is violated.
/// gate.Assert(trajectoryResult, coherenceResult, efficiencyResult);
/// </code>
/// </example>
public sealed class EvaluationQualityGate
{
    // Thresholds in registration order; violations are reported in this order.
    private readonly List<Threshold> _thresholds = [];

    /// <summary>
    /// Requires a <see cref="NumericMetric"/> to be at most <paramref name="max"/>.
    /// </summary>
    /// <param name="metricName">The metric name (use evaluator <c>*MetricName</c> constants).</param>
    /// <param name="max">The maximum allowed value (inclusive). Must not be NaN.</param>
    /// <returns>This gate instance for fluent chaining.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="metricName"/> is null, empty, or whitespace,
    /// or when <paramref name="max"/> is NaN.
    /// </exception>
    public EvaluationQualityGate RequireNumericMax(string metricName, double max)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(metricName);
        ThrowIfNaN(max, nameof(max));

        _thresholds.Add(new NumericMaxThreshold(metricName, max));
        return this;
    }

    /// <summary>
    /// Requires a <see cref="NumericMetric"/> to be at least <paramref name="min"/>.
    /// </summary>
    /// <param name="metricName">The metric name (use evaluator <c>*MetricName</c> constants).</param>
    /// <param name="min">The minimum allowed value (inclusive). Must not be NaN.</param>
    /// <returns>This gate instance for fluent chaining.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="metricName"/> is null, empty, or whitespace,
    /// or when <paramref name="min"/> is NaN.
    /// </exception>
    public EvaluationQualityGate RequireNumericMin(string metricName, double min)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(metricName);
        ThrowIfNaN(min, nameof(min));

        _thresholds.Add(new NumericMinThreshold(metricName, min));
        return this;
    }

    /// <summary>
    /// Requires a <see cref="BooleanMetric"/> to equal <paramref name="expected"/>.
    /// </summary>
    /// <param name="metricName">The metric name (use evaluator <c>*MetricName</c> constants).</param>
    /// <param name="expected">The required boolean value.</param>
    /// <returns>This gate instance for fluent chaining.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="metricName"/> is null, empty, or whitespace.
    /// </exception>
    public EvaluationQualityGate RequireBoolean(string metricName, bool expected)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(metricName);

        _thresholds.Add(new BooleanThreshold(metricName, expected));
        return this;
    }

    /// <summary>
    /// Checks all thresholds against the provided evaluation results. Metrics
    /// are looked up across all results — the first match wins.
    /// </summary>
    /// <param name="results">One or more <see cref="EvaluationResult"/> instances to check.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="results"/> is <see langword="null"/>.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="results"/> contains a <see langword="null"/> entry.
    /// </exception>
    /// <exception cref="QualityGateFailedException">
    /// Thrown when one or more thresholds are violated. The exception message
    /// lists every violation.
    /// </exception>
    public void Assert(params EvaluationResult[] results)
    {
        ArgumentNullException.ThrowIfNull(results);

        // Reject null entries up front so the caller gets an argument error
        // instead of a NullReferenceException from the metric lookup.
        foreach (var result in results)
        {
            if (result is null)
            {
                throw new ArgumentException("Results must not contain null entries.", nameof(results));
            }
        }

        var violations = new List<string>();
        foreach (var threshold in _thresholds)
        {
            // A metric that no result emitted is skipped rather than failed.
            if (FindMetric(results, threshold.MetricName) is not { } metric)
            {
                continue;
            }

            if (threshold.Check(metric) is { } violation)
            {
                violations.Add(violation);
            }
        }

        if (violations.Count > 0)
        {
            throw new QualityGateFailedException(violations);
        }
    }

    /// <summary>
    /// Returns the metric registered under <paramref name="metricName"/> in the
    /// first result that contains it, or <see langword="null"/> when none does.
    /// </summary>
    private static EvaluationMetric? FindMetric(EvaluationResult[] results, string metricName)
    {
        foreach (var result in results)
        {
            if (result.Metrics.TryGetValue(metricName, out var found))
            {
                return found;
            }
        }

        return null;
    }

    /// <summary>
    /// Rejects a NaN bound: every ordered comparison against NaN is false, so
    /// such a threshold could never report a violation.
    /// </summary>
    private static void ThrowIfNaN(double bound, string paramName)
    {
        if (double.IsNaN(bound))
        {
            throw new ArgumentException("Threshold bound must not be NaN.", paramName);
        }
    }

    /// <summary>
    /// A single named requirement that can be checked against a metric.
    /// </summary>
    private abstract class Threshold
    {
        /// <summary>Name of the metric this threshold applies to.</summary>
        public string MetricName { get; }

        // The name is validated by the public Require* methods, which are the
        // only places thresholds are created.
        protected Threshold(string metricName)
        {
            MetricName = metricName;
        }

        /// <summary>
        /// Returns a violation message when <paramref name="metric"/> breaks
        /// this threshold; otherwise <see langword="null"/>.
        /// </summary>
        public abstract string? Check(EvaluationMetric metric);
    }

    /// <summary>Violated when a numeric metric is greater than the bound.</summary>
    private sealed class NumericMaxThreshold(string metricName, double max) : Threshold(metricName)
    {
        public override string? Check(EvaluationMetric metric)
        {
            // Wrong metric type or no value: nothing to compare, so pass.
            if (metric is not NumericMetric { Value: double value })
            {
                return null;
            }

            // "value > max" is false for NaN, which would let it pass silently.
            if (double.IsNaN(value))
            {
                return $"{MetricName}: value is NaN, cannot be compared to max {max:G}";
            }

            return value > max
                ? $"{MetricName}: {value:G} exceeded max {max:G}"
                : null;
        }
    }

    /// <summary>Violated when a numeric metric is less than the bound.</summary>
    private sealed class NumericMinThreshold(string metricName, double min) : Threshold(metricName)
    {
        public override string? Check(EvaluationMetric metric)
        {
            // Wrong metric type or no value: nothing to compare, so pass.
            if (metric is not NumericMetric { Value: double value })
            {
                return null;
            }

            // "value < min" is false for NaN, which would let it pass silently.
            if (double.IsNaN(value))
            {
                return $"{MetricName}: value is NaN, cannot be compared to min {min:G}";
            }

            return value < min
                ? $"{MetricName}: {value:G} below min {min:G}"
                : null;
        }
    }

    /// <summary>Violated when a boolean metric differs from the expected value.</summary>
    private sealed class BooleanThreshold(string metricName, bool expected) : Threshold(metricName)
    {
        public override string? Check(EvaluationMetric metric)
        {
            // Wrong metric type or no value falls through to "no violation".
            if (metric is BooleanMetric { Value: bool actual } && actual != expected)
            {
                return $"{MetricName}: expected {expected}, got {actual}";
            }

            return null;
        }
    }
}