| | | 1 | | using Microsoft.Extensions.AI; |
| | | 2 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 3 | | |
| | | 4 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 5 | | |
| | | 6 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 7 | | |
/// <summary>
/// Deterministic evaluator that scores token usage and cost breakdown per stage of a
/// pipeline run from the captured <see cref="IPipelineRunResult"/> snapshot carried in a
/// <see cref="PipelineEvaluationContext"/>.
/// </summary>
/// <remarks>
/// <para>
/// This evaluator never contacts a language model. It walks the stages of the pipeline
/// result and, for every stage whose diagnostics are non-null, accumulates that stage's
/// <see cref="IAgentRunDiagnostics.AggregateTokenUsage"/> to produce:
/// </para>
/// <list type="bullet">
/// <item><description><c>pipeline.total_tokens</c> — sum of total tokens over all stages with diagnostics.</description></item>
/// <item><description><c>pipeline.total_input_tokens</c> — sum of input tokens over all stages with diagnostics.</description></item>
/// <item><description><c>pipeline.total_output_tokens</c> — sum of output tokens over all stages with diagnostics.</description></item>
/// <item><description><c>pipeline.stage_count</c> — number of stages in the pipeline.</description></item>
/// <item><description><c>pipeline.stages_with_diagnostics</c> — count of stages that have non-null diagnostics.</description></item>
/// <item><description><c>pipeline.most_expensive_stage</c> — name of the stage with the most tokens.</description></item>
/// <item><description><c>pipeline.most_expensive_stage_pct</c> — percentage of total tokens used by the most expensive stage.</description></item>
/// </list>
/// <para>
/// A stage only becomes the "most expensive" stage when its total token count is strictly
/// greater than the best seen so far (starting from zero). Consequently the earliest stage
/// wins a tie, and when no stage reports more than zero tokens the stage name is the empty
/// string and the percentage is <c>0</c>.
/// </para>
/// <para>
/// Numbers embedded in metric reasons are formatted with the invariant culture so the
/// produced text does not depend on the culture of the machine running the evaluation.
/// </para>
/// <para>
/// When no <see cref="PipelineEvaluationContext"/> is present in the
/// <c>additionalContext</c> collection, the evaluator returns an empty
/// <see cref="EvaluationResult"/> — callers should treat that as "not applicable".
/// </para>
/// </remarks>
public sealed class PipelineCostEvaluator : IEvaluator
{
    /// <summary>Metric name for the total token count across all stages.</summary>
    public const string TotalTokensMetricName = "pipeline.total_tokens";

    /// <summary>Metric name for the total input token count.</summary>
    public const string TotalInputTokensMetricName = "pipeline.total_input_tokens";

    /// <summary>Metric name for the total output token count.</summary>
    public const string TotalOutputTokensMetricName = "pipeline.total_output_tokens";

    /// <summary>Metric name for the number of stages in the pipeline.</summary>
    public const string StageCountMetricName = "pipeline.stage_count";

    /// <summary>Metric name for the count of stages that have diagnostics.</summary>
    public const string StagesWithDiagnosticsMetricName = "pipeline.stages_with_diagnostics";

    /// <summary>Metric name for the name of the most expensive stage by token count.</summary>
    public const string MostExpensiveStageMetricName = "pipeline.most_expensive_stage";

    /// <summary>Metric name for the percentage of total tokens used by the most expensive stage.</summary>
    public const string MostExpensiveStagePctMetricName = "pipeline.most_expensive_stage_pct";

    /// <inheritdoc />
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } =
    [
        TotalTokensMetricName,
        TotalInputTokensMetricName,
        TotalOutputTokensMetricName,
        StageCountMetricName,
        StagesWithDiagnosticsMetricName,
        MostExpensiveStageMetricName,
        MostExpensiveStagePctMetricName,
    ];

    /// <inheritdoc />
    /// <remarks>
    /// Only <paramref name="additionalContext"/> is inspected; the remaining arguments are
    /// not read by this method. The work is done synchronously and the returned
    /// <see cref="ValueTask{TResult}"/> is already completed.
    /// </remarks>
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        var pipelineResult = additionalContext?
            .OfType<PipelineEvaluationContext>()
            .FirstOrDefault()?
            .PipelineResult;

        // No pipeline snapshot supplied: return an empty result ("not applicable").
        if (pipelineResult is null)
        {
            return new ValueTask<EvaluationResult>(new EvaluationResult());
        }

        var stages = pipelineResult.Stages;
        var stageCount = stages.Count;

        long totalTokens = 0;
        long totalInputTokens = 0;
        long totalOutputTokens = 0;
        var stagesWithDiagnostics = 0;
        string? mostExpensiveStageName = null;
        long mostExpensiveStageTokens = 0;

        for (var i = 0; i < stageCount; i++)
        {
            var stage = stages[i];
            if (stage.Diagnostics is null)
            {
                // Stages without diagnostics count toward stage_count only.
                continue;
            }

            stagesWithDiagnostics++;
            var usage = stage.Diagnostics.AggregateTokenUsage;
            totalTokens += usage.TotalTokens;
            totalInputTokens += usage.InputTokens;
            totalOutputTokens += usage.OutputTokens;

            // Strict comparison: the earliest stage wins ties, and a stage that reports
            // zero tokens is never selected.
            if (usage.TotalTokens > mostExpensiveStageTokens)
            {
                mostExpensiveStageTokens = usage.TotalTokens;
                mostExpensiveStageName = stage.AgentName;
            }
        }

        var mostExpensivePct = totalTokens > 0
            ? (double)mostExpensiveStageTokens / totalTokens * 100.0
            : 0.0;

        // A missing "most expensive" stage has two distinct causes. Report the real one so
        // the reason text never contradicts the stages_with_diagnostics metric.
        string mostExpensiveStageReason;
        string mostExpensivePctReason;
        if (mostExpensiveStageName is not null)
        {
            mostExpensiveStageReason = FormattableString.Invariant(
                $"Stage '{mostExpensiveStageName}' used the most tokens ({mostExpensiveStageTokens:N0}).");
            mostExpensivePctReason = FormattableString.Invariant(
                $"Stage '{mostExpensiveStageName}' consumed {mostExpensivePct:F1}% of total tokens.");
        }
        else if (stagesWithDiagnostics == 0)
        {
            mostExpensiveStageReason = "No stages have diagnostics to determine the most expensive stage.";
            mostExpensivePctReason = "No stages have diagnostics to compute percentage.";
        }
        else
        {
            mostExpensiveStageReason =
                "No stage with diagnostics reported any token usage, so there is no most expensive stage.";
            mostExpensivePctReason =
                "No stage with diagnostics reported any token usage, so no percentage can be computed.";
        }

        var stagesWithDiagnosticsReason = stagesWithDiagnostics == stageCount
            ? "All stages have diagnostics."
            : $"{stagesWithDiagnostics} of {stageCount} stage(s) have diagnostics.";

        return new ValueTask<EvaluationResult>(new EvaluationResult(
            new NumericMetric(
                TotalTokensMetricName,
                value: totalTokens,
                reason: FormattableString.Invariant(
                    $"{totalTokens:N0} total tokens consumed across {stagesWithDiagnostics} stage(s) with diagnostics.")),
            new NumericMetric(
                TotalInputTokensMetricName,
                value: totalInputTokens,
                reason: FormattableString.Invariant($"{totalInputTokens:N0} input tokens consumed.")),
            new NumericMetric(
                TotalOutputTokensMetricName,
                value: totalOutputTokens,
                reason: FormattableString.Invariant($"{totalOutputTokens:N0} output tokens consumed.")),
            new NumericMetric(
                StageCountMetricName,
                value: stageCount,
                reason: $"Pipeline has {stageCount} stage(s)."),
            new NumericMetric(
                StagesWithDiagnosticsMetricName,
                value: stagesWithDiagnostics,
                reason: stagesWithDiagnosticsReason),
            new StringMetric(
                MostExpensiveStageMetricName,
                value: mostExpensiveStageName ?? string.Empty,
                reason: mostExpensiveStageReason),
            new NumericMetric(
                MostExpensiveStagePctMetricName,
                value: mostExpensivePct,
                reason: mostExpensivePctReason)));
    }
}