fix accuracy tests

nirinchev · nirinchev · commit e49dd1ebc8ad · 2025-10-14T13:33:18.000+02:00
diff --git a/scripts/accuracy/generateTestSummary.ts b/scripts/accuracy/generateTestSummary.ts
@@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
     return toolCalls
         .map((call) => {
             const params = JSON.stringify(call.parameters, null, 2);
-            return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${call.toolName}</span>`;
+            const isOptional = "optional" in call && call.optional;
+            return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}</span>`;
         })
         .join(", ");
 }
diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts
@@ -28,6 +28,11 @@ describeAccuracyTests([
     {
         prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
         expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+                optional: true,
+            },
             {
                 toolName: "list-collections",
                 parameters: {
diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts
@@ -4,6 +4,18 @@ describeAccuracyTests([
     {
         prompt: "Remove mflix.movies namespace from my cluster.",
         expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+                optional: true,
+            },
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "mflix",
+                },
+                optional: true,
+            },
             {
                 toolName: "drop-collection",
                 parameters: {
diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts
@@ -1,3 +1,4 @@
+import { jsonExportFormat } from "../../src/common/exportsManager.js";
 import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
 import { Matcher } from "./sdk/matcher.js";
 
@@ -124,6 +125,7 @@ describeAccuracyTests([
                     limit: Matcher.anyValue,
                     sort: Matcher.anyValue,
                 },
+                optional: true,
             },
             {
                 toolName: "export",
@@ -137,14 +139,19 @@ describeAccuracyTests([
                             arguments: Matcher.anyOf(
                                 Matcher.emptyObjectOrUndefined,
                                 Matcher.value({
-                                    filter: Matcher.anyValue,
+                                    filter: Matcher.emptyObjectOrUndefined,
                                     projection: Matcher.anyValue,
                                     limit: Matcher.anyValue,
                                     sort: Matcher.anyValue,
                                 })
                             ),
                         },
                     ],
+                    jsonExportFormat: Matcher.anyOf(
+                        Matcher.undefined,
+                        Matcher.value("relaxed"),
+                        Matcher.value("canonical")
+                    ),
                 },
             },
         ],
diff --git a/tests/accuracy/getPerformanceAdvisor.test.ts b/tests/accuracy/getPerformanceAdvisor.test.ts
@@ -35,21 +35,27 @@ const mockedTools = {
     },
 };
 
+const listProjectsAndClustersToolCalls = [
+    {
+        toolName: "atlas-list-projects",
+        parameters: {},
+        optional: true,
+    },
+    {
+        toolName: "atlas-list-clusters",
+        parameters: {
+            projectId: "mflix",
+        },
+        optional: true,
+    },
+];
+
 describeAccuracyTests([
     // Test for Suggested Indexes operation
     {
         prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -65,16 +71,7 @@ describeAccuracyTests([
     {
         prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -88,18 +85,9 @@ describeAccuracyTests([
     },
     // Test for Slow Query Logs operation
     {
-        prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
+        prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025 (a date that is certainly in the past!).",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -117,16 +105,7 @@ describeAccuracyTests([
     {
         prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -142,16 +121,7 @@ describeAccuracyTests([
     {
         prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
diff --git a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
@@ -4,7 +4,9 @@ export interface LLMToolCall {
     parameters: Record<string, unknown>;
 }
 
-export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
+export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
+    optional?: boolean;
+};
 
 export const AccuracyRunStatus = {
     Done: "done",
diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts
@@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
             .sort((a, b) => b.score - a.score || a.index - b.index);
 
         const bestMatch = candidates[0];
-        if (!bestMatch || bestMatch.score === 0) {
-            return 0; // No matching tool call found, return 0
+        if (bestMatch) {
+            checkedActualToolCallIndexes.add(bestMatch.index);
+            currentScore = Math.min(currentScore, bestMatch.score);
+        } else if (expectedCall.optional) {
+            // Optional expected tool call not found, but it's okay, continue
+            continue;
+        } else {
+            return 0; // Required expected tool call not found, return 0
         }
-
-        checkedActualToolCallIndexes.add(bestMatch.index);
-        currentScore = Math.min(currentScore, bestMatch.score);
     }
 
     return currentScore;

Review excerpt — tests/accuracy/createCollection.test.ts
Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,11 @@ describeAccuracyTests([`
`28`	`28`	`{`
`29`	`29`	`prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",`
`30`	`30`	`expectedToolCalls: [`
	`31`	`+ {`
	`32`	`+ toolName: "list-databases",`
	`33`	`+ parameters: {},`
	`34`	`+ optional: true,`
	`35`	`+ },`
`31`	`36`	`{`
`32`	`37`	`toolName: "list-collections",`
`33`	`38`	`parameters: {`
Review excerpt — tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,9 @@ export interface LLMToolCall {`
`4`	`4`	`parameters: Record<string, unknown>;`
`5`	`5`	`}`
`6`	`6`
`7`		`-export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;`
	`7`	`+export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {`
	`8`	`+ optional?: boolean;`
	`9`	`+};`
`8`	`10`
`9`	`11`	`export const AccuracyRunStatus = {`
`10`	`12`	`Done: "done",`