| | | 1 | | using Microsoft.Extensions.AI; |
| | | 2 | | using Microsoft.Extensions.AI.Evaluation; |
| | | 3 | | |
| | | 4 | | using NexusLabs.Needlr.AgentFramework.Diagnostics; |
| | | 5 | | |
| | | 6 | | namespace NexusLabs.Needlr.AgentFramework.Evaluation; |
| | | 7 | | |
| | | 8 | | /// <summary> |
| | | 9 | | /// Deterministic evaluator that scores the token efficiency and cost profile of an |
| | | 10 | | /// agent run from the captured <see cref="IAgentRunDiagnostics"/> snapshot carried in |
| | | 11 | | /// an <see cref="AgentRunDiagnosticsContext"/>. |
| | | 12 | | /// </summary> |
| | | 13 | | /// <remarks> |
| | | 14 | | /// <para> |
| | | 15 | | /// This evaluator never contacts a language model. It reads |
| | | 16 | | /// <see cref="IAgentRunDiagnostics.AggregateTokenUsage"/> and |
| | | 17 | | /// <see cref="IAgentRunDiagnostics.ToolCalls"/> to produce: |
| | | 18 | | /// </para> |
| | | 19 | | /// <list type="bullet"> |
| | | 20 | | /// <item><description><c>Total Tokens</c> — aggregate token count across all LLM calls.</description></item> |
| | | 21 | | /// <item><description><c>Input Token Ratio</c> — input tokens / total tokens. High values suggest verbose prompts; low values suggest verbose responses.</description></item> |
| | | 22 | | /// <item><description><c>Tokens Per Tool Call</c> — total tokens / tool call count. Measures the token cost of each tool invocation.</description></item> |
| | | 23 | | /// <item><description><c>Cache Hit Ratio</c> — cached input tokens / input tokens. Higher values mean more prompt-cache reuse and lower cost.</description></item> |
| | | 24 | | /// <item><description><c>Under Budget</c> — boolean. <see langword="true"/> when total tokens is strictly below the configured budget.</description></item> |
| | | 25 | | /// </list> |
| | | 26 | | /// <para> |
| | | 27 | | /// When no <see cref="AgentRunDiagnosticsContext"/> is present in the |
| | | 28 | | /// <c>additionalContext</c> collection, the evaluator returns an empty |
| | | 29 | | /// <see cref="EvaluationResult"/> — callers should treat that as "not applicable". |
| | | 30 | | /// </para> |
| | | 31 | | /// </remarks> |
| | | 32 | | /// <example> |
| | | 33 | | /// <code> |
| | | 34 | | /// // Score efficiency with a 10,000-token budget |
| | | 35 | | /// var evaluator = new EfficiencyEvaluator(tokenBudget: 10_000); |
| | | 36 | | /// var result = await evaluator.EvaluateAsync( |
| | | 37 | | /// messages: Array.Empty&lt;ChatMessage&gt;(), |
| | | 38 | | /// modelResponse: new ChatResponse(), |
| | | 39 | | /// additionalContext: [new AgentRunDiagnosticsContext(diagnostics)]); |
| | | 40 | | /// |
| | | 41 | | /// var underBudget = ((BooleanMetric)result.Metrics["Under Budget"]).Value; |
| | | 42 | | /// var tokensPerTool = ((NumericMetric)result.Metrics["Tokens Per Tool Call"]).Value; |
| | | 43 | | /// </code> |
| | | 44 | | /// </example> |
public sealed class EfficiencyEvaluator : IEvaluator
{
    /// <summary>Metric name for the aggregate token count.</summary>
    public const string TotalTokensMetricName = "Total Tokens";

    /// <summary>Metric name for the input-to-total token ratio.</summary>
    public const string InputTokenRatioMetricName = "Input Token Ratio";

    /// <summary>Metric name for tokens consumed per tool call.</summary>
    public const string TokensPerToolCallMetricName = "Tokens Per Tool Call";

    /// <summary>Metric name for the prompt-cache hit ratio.</summary>
    public const string CacheHitRatioMetricName = "Cache Hit Ratio";

    /// <summary>Metric name for the boolean budget check.</summary>
    public const string UnderBudgetMetricName = "Under Budget";

    private readonly long? _tokenBudget;

    /// <summary>
    /// Creates a new <see cref="EfficiencyEvaluator"/>.
    /// </summary>
    /// <param name="tokenBudget">
    /// Optional token budget. When provided, the evaluator emits the
    /// <see cref="UnderBudgetMetricName"/> metric. When <see langword="null"/>,
    /// the metric is omitted.
    /// </param>
    public EfficiencyEvaluator(long? tokenBudget = null)
    {
        _tokenBudget = tokenBudget;

        // The advertised metric set is fixed at construction time so callers can
        // discover up front exactly which metrics this instance will emit.
        var names = new List<string>
        {
            TotalTokensMetricName,
            InputTokenRatioMetricName,
            TokensPerToolCallMetricName,
            CacheHitRatioMetricName,
        };
        if (tokenBudget.HasValue)
        {
            names.Add(UnderBudgetMetricName);
        }
        EvaluationMetricNames = names;
    }

    /// <inheritdoc />
    public IReadOnlyCollection<string> EvaluationMetricNames { get; }

    /// <inheritdoc />
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        // Locate the diagnostics snapshot carried alongside the conversation.
        // Only the first AgentRunDiagnosticsContext is considered.
        var diagnostics = additionalContext?
            .OfType<AgentRunDiagnosticsContext>()
            .FirstOrDefault()?
            .Diagnostics;

        // No diagnostics context means "not applicable" — return an empty
        // result rather than throwing, per the class contract.
        if (diagnostics is null)
        {
            return new ValueTask<EvaluationResult>(new EvaluationResult());
        }

        var usage = diagnostics.AggregateTokenUsage;
        var totalTokens = usage.TotalTokens;
        var inputTokens = usage.InputTokens;
        var cachedInputTokens = usage.CachedInputTokens;
        var toolCallCount = diagnostics.ToolCalls.Count;

        // Each ratio guards its denominator and degrades to 0 rather than
        // producing NaN/Infinity for empty runs.
        var inputTokenRatio = totalTokens > 0
            ? (double)inputTokens / totalTokens
            : 0;

        var tokensPerToolCall = toolCallCount > 0
            ? (double)totalTokens / toolCallCount
            : 0;

        var cacheHitRatio = inputTokens > 0
            ? (double)cachedInputTokens / inputTokens
            : 0;

        var metrics = new List<EvaluationMetric>
        {
            new NumericMetric(
                TotalTokensMetricName,
                value: totalTokens,
                reason: totalTokens == 0
                    ? "No token usage was recorded."
                    : $"{totalTokens:N0} total tokens consumed ({inputTokens:N0} input, {usage.OutputTokens:N0} output)."),

            new NumericMetric(
                InputTokenRatioMetricName,
                value: inputTokenRatio,
                reason: totalTokens == 0
                    ? "No tokens to compute ratio."
                    : $"{inputTokenRatio:P1} of tokens were input ({inputTokens:N0} of {totalTokens:N0})."),

            new NumericMetric(
                TokensPerToolCallMetricName,
                value: tokensPerToolCall,
                reason: toolCallCount == 0
                    ? "No tool calls to compute per-call cost."
                    : $"{tokensPerToolCall:N0} tokens per tool call ({totalTokens:N0} tokens / {toolCallCount} calls)."),

            new NumericMetric(
                CacheHitRatioMetricName,
                value: cacheHitRatio,
                reason: inputTokens == 0
                    ? "No input tokens to compute cache ratio."
                    : $"{cacheHitRatio:P1} of input tokens were cache hits ({cachedInputTokens:N0} of {inputTokens:N0})."),
        };

        // The budget metric is strict: exactly meeting the budget counts as
        // NOT under budget (totalTokens < budget, not <=).
        if (_tokenBudget.HasValue)
        {
            var underBudget = totalTokens < _tokenBudget.Value;
            metrics.Add(new BooleanMetric(
                UnderBudgetMetricName,
                value: underBudget,
                reason: underBudget
                    ? $"Token usage ({totalTokens:N0}) is under the budget of {_tokenBudget.Value:N0}."
                    : $"Token usage ({totalTokens:N0}) reached or exceeded the budget of {_tokenBudget.Value:N0}."));
        }

        return new ValueTask<EvaluationResult>(new EvaluationResult(metrics.ToArray()));
    }
}