< Summary

Information

Class:	NexusLabs.Needlr.AgentFramework.Evaluation.ToolCallTrajectoryEvaluator
Assembly:	NexusLabs.Needlr.AgentFramework.Evaluation
File(s):	/home/runner/work/needlr/needlr/src/NexusLabs.Needlr.AgentFramework.Evaluation/ToolCallTrajectoryEvaluator.cs

Line coverage

99%

Covered lines:	130
Uncovered lines:	1
Coverable lines:	131
Total lines:	281
Line coverage:	99.2%

Branch coverage

98%

Covered branches:	51
Total branches:	52
Branch coverage:	98%

Method coverage

Feature is only available for sponsors

Upgrade to PRO version

Metrics

Method	Branch coverage	Crap Score	Cyclomatic complexity	Line coverage
get_EvaluationMetricNames()	100%	2	1	0%
.ctor()	100%	1	1	100%
EvaluateAsync(...)	96.15%	26	26	100%
CountSequenceGaps(...)	100%	8	8	100%
CountConsecutiveSameTool(...)	100%	6	6	100%
BuildPerToolFailureRate(...)	100%	8	8	100%
ComputeLatencyPercentiles(...)	100%	4	4	100%
NearestRankPercentile(...)	100%	1	1	100%

File(s)

/home/runner/work/needlr/needlr/src/NexusLabs.Needlr.AgentFramework.Evaluation/ToolCallTrajectoryEvaluator.cs

#	Line	Line coverage
	`1`	`using System.Text.Json;`
	`2`
	`3`	`using Microsoft.Extensions.AI;`
	`4`	`using Microsoft.Extensions.AI.Evaluation;`
	`5`
	`6`	`using NexusLabs.Needlr.AgentFramework.Diagnostics;`
	`7`
	`8`	`namespace NexusLabs.Needlr.AgentFramework.Evaluation;`
	`9`
	`10`	`/// <summary>`
	`11`	`/// Deterministic evaluator that scores the tool-call trajectory of an agent run from`
	`12`	`/// the captured <see cref="IAgentRunDiagnostics"/> snapshot carried in an`
	`13`	`/// <see cref="AgentRunDiagnosticsContext"/>.`
	`14`	`/// </summary>`
	`15`	`/// <remarks>`
	`16`	`/// <para>`
	`17`	`/// This evaluator never contacts a language model. It reads the ordered`
	`18`	`/// <see cref="IAgentRunDiagnostics.ToolCalls"/> collection and produces:`
	`19`	`/// </para>`
	`20`	`/// <list type="bullet">`
	`21`	`/// <item><description><c>Tool Calls Total</c> — total number of tool invocations.</description></item>`
	`22`	`/// <item><description><c>Tool Calls Failed</c> — count of tool invocations whose <see cref="ToolCallDiagnostics.Succe`
	`23`	`/// <item><description><c>Tool Call Sequence Gaps</c> — number of missing slots in the <see cref="ToolCallDiagnostics.`
	`24`	`/// <item><description><c>All Tool Calls Succeeded</c> — boolean rollup. <see langword="true"/> when every tool invoca`
	`25`	`/// <item><description><c>Consecutive Same-Tool Calls</c> — count of consecutive tool invocations with the same <see c`
	`26`	`/// <item><description><c>Per-Tool Failure Rate</c> — JSON string mapping each tool name to its failure rate (0.0–1.0)`
	`27`	`/// <item><description><c>Tool Call Latency P50</c> — 50th percentile of tool call durations in milliseconds (nearest-`
	`28`	`/// <item><description><c>Tool Call Latency P95</c> — 95th percentile of tool call durations in milliseconds (nearest-`
	`29`	`/// </list>`
	`30`	`/// <para>`
	`31`	`/// When no <see cref="AgentRunDiagnosticsContext"/> is present in the`
	`32`	`/// <c>additionalContext</c> collection, the evaluator returns an empty`
	`33`	`/// <see cref="EvaluationResult"/> — callers should treat that as "not applicable".`
	`34`	`/// </para>`
	`35`	`/// </remarks>`
	`36`	`public sealed class ToolCallTrajectoryEvaluator : IEvaluator`
	`37`	`{`
	`38`	`/// <summary>Metric name for the total tool-call count.</summary>`
	`39`	`public const string TotalMetricName = "Tool Calls Total";`
	`40`
	`41`	`/// <summary>Metric name for the failed tool-call count.</summary>`
	`42`	`public const string FailedMetricName = "Tool Calls Failed";`
	`43`
	`44`	`/// <summary>Metric name for the number of gaps in the recorded tool-call sequence.</summary>`
	`45`	`public const string SequenceGapsMetricName = "Tool Call Sequence Gaps";`
	`46`
	`47`	`/// <summary>Metric name for the boolean rollup indicating every tool call succeeded.</summary>`
	`48`	`public const string AllSucceededMetricName = "All Tool Calls Succeeded";`
	`49`
	`50`	`/// <summary>Metric name for the count of consecutive tool calls with the same tool name.</summary>`
	`51`	`public const string ConsecutiveSameToolMetricName = "Consecutive Same-Tool Calls";`
	`52`
	`53`	`/// <summary>Metric name for the JSON-formatted per-tool failure rate breakdown.</summary>`
	`54`	`public const string PerToolFailureRateMetricName = "Per-Tool Failure Rate";`
	`55`
	`56`	`/// <summary>Metric name for the 50th percentile tool-call latency in milliseconds.</summary>`
	`57`	`public const string LatencyP50MetricName = "Tool Call Latency P50";`
	`58`
	`59`	`/// <summary>Metric name for the 95th percentile tool-call latency in milliseconds.</summary>`
	`60`	`public const string LatencyP95MetricName = "Tool Call Latency P95";`
	`61`
	`62`	`/// <inheritdoc />`
0	`63`	`public IReadOnlyCollection<string> EvaluationMetricNames { get; } =`
16	`64`	`[`
16	`65`	`TotalMetricName,`
16	`66`	`FailedMetricName,`
16	`67`	`SequenceGapsMetricName,`
16	`68`	`AllSucceededMetricName,`
16	`69`	`ConsecutiveSameToolMetricName,`
16	`70`	`PerToolFailureRateMetricName,`
16	`71`	`LatencyP50MetricName,`
16	`72`	`LatencyP95MetricName,`
16	`73`	`];`
	`74`
	`75`	`/// <inheritdoc />`
	`76`	`public ValueTask<EvaluationResult> EvaluateAsync(`
	`77`	`IEnumerable<ChatMessage> messages,`
	`78`	`ChatResponse modelResponse,`
	`79`	`ChatConfiguration? chatConfiguration = null,`
	`80`	`IEnumerable<EvaluationContext>? additionalContext = null,`
	`81`	`CancellationToken cancellationToken = default)`
	`82`	`{`
16	`83`	`var diagnostics = additionalContext?`
16	`84`	`.OfType<AgentRunDiagnosticsContext>()`
16	`85`	`.FirstOrDefault()?`
16	`86`	`.Diagnostics;`
	`87`
16	`88`	`if (diagnostics is null)`
	`89`	`{`
1	`90`	`return new ValueTask<EvaluationResult>(new EvaluationResult());`
	`91`	`}`
	`92`
15	`93`	`var toolCalls = diagnostics.ToolCalls;`
15	`94`	`var total = toolCalls.Count;`
15	`95`	`var failed = 0;`
96	`96`	`for (var i = 0; i < toolCalls.Count; i++)`
	`97`	`{`
33	`98`	`if (!toolCalls[i].Succeeded)`
	`99`	`{`
3	`100`	`failed++;`
	`101`	`}`
	`102`	`}`
	`103`
15	`104`	`var gaps = CountSequenceGaps(toolCalls);`
15	`105`	`var allSucceeded = failed == 0;`
15	`106`	`var consecutiveSameTool = CountConsecutiveSameTool(toolCalls);`
15	`107`	`var perToolFailureRate = BuildPerToolFailureRate(toolCalls);`
15	`108`	`var (p50, p95) = ComputeLatencyPercentiles(toolCalls);`
	`109`
15	`110`	`var totalMetric = new NumericMetric(`
15	`111`	`TotalMetricName,`
15	`112`	`value: total,`
15	`113`	`reason: total == 0`
15	`114`	`? "No tool calls were recorded for this agent run."`
15	`115`	`: $"{total} tool call(s) were recorded.");`
	`116`
15	`117`	`var failedMetric = new NumericMetric(`
15	`118`	`FailedMetricName,`
15	`119`	`value: failed,`
15	`120`	`reason: failed == 0`
15	`121`	`? "All recorded tool calls succeeded."`
15	`122`	`: $"{failed} of {total} recorded tool call(s) failed.");`
	`123`
15	`124`	`var gapsMetric = new NumericMetric(`
15	`125`	`SequenceGapsMetricName,`
15	`126`	`value: gaps,`
15	`127`	`reason: gaps == 0`
15	`128`	`? "The tool-call sequence is contiguous starting at 0."`
15	`129`	`: $"{gaps} gap(s) detected in the tool-call sequence.");`
	`130`
15	`131`	`var allSucceededMetric = new BooleanMetric(`
15	`132`	`AllSucceededMetricName,`
15	`133`	`value: allSucceeded,`
15	`134`	`reason: allSucceeded`
15	`135`	`? "Every recorded tool call reported success."`
15	`136`	`: "At least one recorded tool call reported failure.");`
	`137`
15	`138`	`var consecutiveMetric = new NumericMetric(`
15	`139`	`ConsecutiveSameToolMetricName,`
15	`140`	`value: consecutiveSameTool,`
15	`141`	`reason: consecutiveSameTool == 0`
15	`142`	`? "No consecutive same-tool calls detected."`
15	`143`	`: $"{consecutiveSameTool} consecutive same-tool call(s) detected (heuristic — may include valid parallel`
	`144`
15	`145`	`var failureRateMetric = new StringMetric(`
15	`146`	`PerToolFailureRateMetricName,`
15	`147`	`value: perToolFailureRate,`
15	`148`	`reason: total == 0`
15	`149`	`? "No tool calls to compute failure rates."`
15	`150`	`: "Per-tool failure rates as JSON (tool name → failure rate 0.0–1.0).");`
	`151`
15	`152`	`var p50Metric = new NumericMetric(`
15	`153`	`LatencyP50MetricName,`
15	`154`	`value: p50,`
15	`155`	`reason: total == 0`
15	`156`	`? "No tool calls to compute latency."`
15	`157`	`: $"50th percentile tool-call latency: {p50:F1}ms.");`
	`158`
15	`159`	`var p95Metric = new NumericMetric(`
15	`160`	`LatencyP95MetricName,`
15	`161`	`value: p95,`
15	`162`	`reason: total == 0`
15	`163`	`? "No tool calls to compute latency."`
15	`164`	`: $"95th percentile tool-call latency: {p95:F1}ms.");`
	`165`
15	`166`	`return new ValueTask<EvaluationResult>(new EvaluationResult(`
15	`167`	`totalMetric,`
15	`168`	`failedMetric,`
15	`169`	`gapsMetric,`
15	`170`	`allSucceededMetric,`
15	`171`	`consecutiveMetric,`
15	`172`	`failureRateMetric,`
15	`173`	`p50Metric,`
15	`174`	`p95Metric));`
	`175`	`}`
	`176`
	`177`	`private static int CountSequenceGaps(IReadOnlyList<ToolCallDiagnostics> toolCalls)`
	`178`	`{`
15	`179`	`if (toolCalls.Count == 0)`
	`180`	`{`
4	`181`	`return 0;`
	`182`	`}`
	`183`
11	`184`	`var sequences = new int[toolCalls.Count];`
88	`185`	`for (var i = 0; i < toolCalls.Count; i++)`
	`186`	`{`
33	`187`	`sequences[i] = toolCalls[i].Sequence;`
	`188`	`}`
11	`189`	`Array.Sort(sequences);`
	`190`
11	`191`	`var gaps = 0;`
11	`192`	`var expected = sequences[0];`
88	`193`	`for (var i = 0; i < sequences.Length; i++)`
	`194`	`{`
33	`195`	`var actual = sequences[i];`
33	`196`	`if (actual > expected)`
	`197`	`{`
2	`198`	`gaps += actual - expected;`
	`199`	`}`
33	`200`	`expected = actual + 1;`
	`201`	`}`
	`202`
11	`203`	`return gaps;`
	`204`	`}`
	`205`
	`206`	`private static int CountConsecutiveSameTool(IReadOnlyList<ToolCallDiagnostics> toolCalls)`
	`207`	`{`
15	`208`	`if (toolCalls.Count <= 1)`
	`209`	`{`
5	`210`	`return 0;`
	`211`	`}`
	`212`
10	`213`	`var count = 0;`
64	`214`	`for (var i = 1; i < toolCalls.Count; i++)`
	`215`	`{`
22	`216`	`if (string.Equals(toolCalls[i].ToolName, toolCalls[i - 1].ToolName, StringComparison.Ordinal))`
	`217`	`{`
4	`218`	`count++;`
	`219`	`}`
	`220`	`}`
	`221`
10	`222`	`return count;`
	`223`	`}`
	`224`
	`225`	`private static string BuildPerToolFailureRate(IReadOnlyList<ToolCallDiagnostics> toolCalls)`
	`226`	`{`
15	`227`	`if (toolCalls.Count == 0)`
	`228`	`{`
4	`229`	`return "{}";`
	`230`	`}`
	`231`
11	`232`	`var totals = new SortedDictionary<string, int>(StringComparer.Ordinal);`
11	`233`	`var failures = new SortedDictionary<string, int>(StringComparer.Ordinal);`
	`234`
88	`235`	`for (var i = 0; i < toolCalls.Count; i++)`
	`236`	`{`
33	`237`	`var name = toolCalls[i].ToolName;`
33	`238`	`totals.TryGetValue(name, out var t);`
33	`239`	`totals[name] = t + 1;`
	`240`
33	`241`	`if (!toolCalls[i].Succeeded)`
	`242`	`{`
3	`243`	`failures.TryGetValue(name, out var f);`
3	`244`	`failures[name] = f + 1;`
	`245`	`}`
	`246`	`}`
	`247`
11	`248`	`var rates = new SortedDictionary<string, double>(StringComparer.Ordinal);`
78	`249`	`foreach (var kvp in totals)`
	`250`	`{`
28	`251`	`failures.TryGetValue(kvp.Key, out var f);`
28	`252`	`rates[kvp.Key] = (double)f / kvp.Value;`
	`253`	`}`
	`254`
11	`255`	`return JsonSerializer.Serialize(rates);`
	`256`	`}`
	`257`
	`258`	`private static (double P50, double P95) ComputeLatencyPercentiles(`
	`259`	`IReadOnlyList<ToolCallDiagnostics> toolCalls)`
	`260`	`{`
15	`261`	`if (toolCalls.Count == 0)`
	`262`	`{`
4	`263`	`return (0, 0);`
	`264`	`}`
	`265`
11	`266`	`var durations = new double[toolCalls.Count];`
88	`267`	`for (var i = 0; i < toolCalls.Count; i++)`
	`268`	`{`
33	`269`	`durations[i] = toolCalls[i].Duration.TotalMilliseconds;`
	`270`	`}`
11	`271`	`Array.Sort(durations);`
	`272`
11	`273`	`return (NearestRankPercentile(durations, 50), NearestRankPercentile(durations, 95));`
	`274`	`}`
	`275`
	`276`	`private static double NearestRankPercentile(double[] sorted, int percentile)`
	`277`	`{`
22	`278`	`var index = (int)Math.Ceiling(percentile / 100.0 * sorted.Length) - 1;`
22	`279`	`return sorted[Math.Clamp(index, 0, sorted.Length - 1)];`
	`280`	`}`
	`281`	`}`

< Summary

Metrics

File(s)

/home/runner/work/needlr/needlr/src/NexusLabs.Needlr.AgentFramework.Evaluation/ToolCallTrajectoryEvaluator.cs

Methods/Properties