Skip to content

Commit e49dd1e

Browse files
committed
fix accuracy tests
1 parent 2d3b74e commit e49dd1e

File tree

7 files changed

+59
-59
lines changed

7 files changed

+59
-59
lines changed

scripts/accuracy/generateTestSummary.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
7373
return toolCalls
7474
.map((call) => {
7575
const params = JSON.stringify(call.parameters, null, 2);
76-
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${call.toolName}</span>`;
76+
const isOptional = "optional" in call && call.optional;
77+
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}</span>`;
7778
})
7879
.join(", ");
7980
}

tests/accuracy/createCollection.test.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,11 @@ describeAccuracyTests([
2828
{
2929
prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
3030
expectedToolCalls: [
31+
{
32+
toolName: "list-databases",
33+
parameters: {},
34+
optional: true,
35+
},
3136
{
3237
toolName: "list-collections",
3338
parameters: {

tests/accuracy/dropCollection.test.ts

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,18 @@ describeAccuracyTests([
44
{
55
prompt: "Remove mflix.movies namespace from my cluster.",
66
expectedToolCalls: [
7+
{
8+
toolName: "list-databases",
9+
parameters: {},
10+
optional: true,
11+
},
12+
{
13+
toolName: "list-collections",
14+
parameters: {
15+
database: "mflix",
16+
},
17+
optional: true,
18+
},
719
{
820
toolName: "drop-collection",
921
parameters: {

tests/accuracy/find.test.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import { jsonExportFormat } from "../../src/common/exportsManager.js";
12
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
23
import { Matcher } from "./sdk/matcher.js";
34

@@ -124,6 +125,7 @@ describeAccuracyTests([
124125
limit: Matcher.anyValue,
125126
sort: Matcher.anyValue,
126127
},
128+
optional: true,
127129
},
128130
{
129131
toolName: "export",
@@ -137,14 +139,19 @@ describeAccuracyTests([
137139
arguments: Matcher.anyOf(
138140
Matcher.emptyObjectOrUndefined,
139141
Matcher.value({
140-
filter: Matcher.anyValue,
142+
filter: Matcher.emptyObjectOrUndefined,
141143
projection: Matcher.anyValue,
142144
limit: Matcher.anyValue,
143145
sort: Matcher.anyValue,
144146
})
145147
),
146148
},
147149
],
150+
jsonExportFormat: Matcher.anyOf(
151+
Matcher.undefined,
152+
Matcher.value("relaxed"),
153+
Matcher.value("canonical")
154+
),
148155
},
149156
},
150157
],

tests/accuracy/getPerformanceAdvisor.test.ts

Lines changed: 21 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -35,21 +35,27 @@ const mockedTools = {
3535
},
3636
};
3737

38+
const listProjectsAndClustersToolCalls = [
39+
{
40+
toolName: "atlas-list-projects",
41+
parameters: {},
42+
optional: true,
43+
},
44+
{
45+
toolName: "atlas-list-clusters",
46+
parameters: {
47+
projectId: "mflix",
48+
},
49+
optional: true,
50+
},
51+
];
52+
3853
describeAccuracyTests([
3954
// Test for Suggested Indexes operation
4055
{
4156
prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
4257
expectedToolCalls: [
43-
{
44-
toolName: "atlas-list-projects",
45-
parameters: {},
46-
},
47-
{
48-
toolName: "atlas-list-clusters",
49-
parameters: {
50-
projectId: "mflix",
51-
},
52-
},
58+
...listProjectsAndClustersToolCalls,
5359
{
5460
toolName: "atlas-get-performance-advisor",
5561
parameters: {
@@ -65,16 +71,7 @@ describeAccuracyTests([
6571
{
6672
prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
6773
expectedToolCalls: [
68-
{
69-
toolName: "atlas-list-projects",
70-
parameters: {},
71-
},
72-
{
73-
toolName: "atlas-list-clusters",
74-
parameters: {
75-
projectId: "mflix",
76-
},
77-
},
74+
...listProjectsAndClustersToolCalls,
7875
{
7976
toolName: "atlas-get-performance-advisor",
8077
parameters: {
@@ -88,18 +85,9 @@ describeAccuracyTests([
8885
},
8986
// Test for Slow Query Logs operation
9087
{
91-
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
88+
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025 (a date that is certainly in the past!).",
9289
expectedToolCalls: [
93-
{
94-
toolName: "atlas-list-projects",
95-
parameters: {},
96-
},
97-
{
98-
toolName: "atlas-list-clusters",
99-
parameters: {
100-
projectId: "mflix",
101-
},
102-
},
90+
...listProjectsAndClustersToolCalls,
10391
{
10492
toolName: "atlas-get-performance-advisor",
10593
parameters: {
@@ -117,16 +105,7 @@ describeAccuracyTests([
117105
{
118106
prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
119107
expectedToolCalls: [
120-
{
121-
toolName: "atlas-list-projects",
122-
parameters: {},
123-
},
124-
{
125-
toolName: "atlas-list-clusters",
126-
parameters: {
127-
projectId: "mflix",
128-
},
129-
},
108+
...listProjectsAndClustersToolCalls,
130109
{
131110
toolName: "atlas-get-performance-advisor",
132111
parameters: {
@@ -142,16 +121,7 @@ describeAccuracyTests([
142121
{
143122
prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
144123
expectedToolCalls: [
145-
{
146-
toolName: "atlas-list-projects",
147-
parameters: {},
148-
},
149-
{
150-
toolName: "atlas-list-clusters",
151-
parameters: {
152-
projectId: "mflix",
153-
},
154-
},
124+
...listProjectsAndClustersToolCalls,
155125
{
156126
toolName: "atlas-get-performance-advisor",
157127
parameters: {

tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,9 @@ export interface LLMToolCall {
44
parameters: Record<string, unknown>;
55
}
66

7-
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
7+
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
8+
optional?: boolean;
9+
};
810

911
export const AccuracyRunStatus = {
1012
Done: "done",

tests/accuracy/sdk/accuracyScorer.ts

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
8181
.sort((a, b) => b.score - a.score || a.index - b.index);
8282

8383
const bestMatch = candidates[0];
84-
if (!bestMatch || bestMatch.score === 0) {
85-
return 0; // No matching tool call found, return 0
84+
if (bestMatch) {
85+
checkedActualToolCallIndexes.add(bestMatch.index);
86+
currentScore = Math.min(currentScore, bestMatch.score);
87+
} else if (expectedCall.optional) {
88+
// Optional expected tool call not found, but it's okay, continue
89+
continue;
90+
} else {
91+
return 0; // Required expected tool call not found, return 0
8692
}
87-
88-
checkedActualToolCallIndexes.add(bestMatch.index);
89-
currentScore = Math.min(currentScore, bestMatch.score);
9093
}
9194

9295
return currentScore;

0 commit comments

Comments
 (0)