From defe019bc8da44909e9d490f740a99be2436648a Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Wed, 23 Jul 2025 11:43:04 -0400 Subject: [PATCH 1/4] feat(js/genkit-tools/evals): Support (executable) prompt evals --- genkit-tools/common/src/eval/evaluate.ts | 167 ++++++++++++++++++++--- genkit-tools/common/src/types/eval.ts | 7 +- genkit-tools/common/src/utils/eval.ts | 12 +- js/testapps/evals/prompts/hello.prompt | 12 ++ js/testapps/evals/src/genkit.ts | 24 ---- 5 files changed, 177 insertions(+), 45 deletions(-) create mode 100644 js/testapps/evals/prompts/hello.prompt diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 65292e1f2b..4cc022f3d9 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -19,6 +19,8 @@ import { getDatasetStore, getEvalStore } from '.'; import type { RuntimeManager } from '../manager/manager'; import { DatasetSchema, + GenerateActionOptions, + GenerateActionOptionsSchema, GenerateResponseSchema, type Action, type CandidateData, @@ -33,6 +35,7 @@ import { import { evaluatorName, generateTestCaseId, + getAction, getEvalExtractors, getModelInput, hasAction, @@ -50,7 +53,7 @@ interface InferenceRunState { testCaseId: string; input: any; reference?: any; - traceId?: string; + traceIds: string[]; response?: any; evalError?: string; } @@ -61,8 +64,8 @@ interface FullInferenceSample { reference?: any; } -const SUPPORTED_ACTION_TYPES = ['flow', 'model'] as const; - +const SUPPORTED_ACTION_TYPES = ['flow', 'model', 'executable-prompt'] as const; +type SupportedActionType = (typeof SUPPORTED_ACTION_TYPES)[number]; /** * Starts a new evaluation run. Intended to be used via the reflection API. */ @@ -253,7 +256,7 @@ async function bulkRunAction(params: { }): Promise { const { manager, actionRef, inferenceDataset, context, actionConfig } = params; - const isModelAction = actionRef.startsWith('/model'); + const actionType = getSupportedActionType(actionRef); if (inferenceDataset.length === 0) { throw new Error('Cannot run inference, no data provided'); } @@ -267,7 +270,7 @@ async function bulkRunAction(params: { const evalInputs: EvalInput[] = []; for (const sample of fullInferenceDataset) { logger.info(`Running inference '${actionRef}' ...`); - if (isModelAction) { + if (actionType === 'model') { states.push( await runModelAction({ manager, @@ -276,7 +279,8 @@ async function bulkRunAction(params: { modelConfig: actionConfig, }) ); - } else { + } + if (actionType === 'flow') { states.push( await runFlowAction({ manager, @@ -285,6 +289,16 @@ async function bulkRunAction(params: { context, }) ); + } else { + states.push( + await runPromptAction({ + manager, + actionRef, + sample, + context, + modelConfig: actionConfig, + }) + ); } } @@ -311,14 +325,16 @@ async function runFlowAction(params: { }); state = { ...sample, - traceId: runActionResponse.telemetry?.traceId, + traceIds: runActionResponse.telemetry?.traceId + ? [runActionResponse.telemetry?.traceId] + : [], response: runActionResponse.result, }; } catch (e: any) { const traceId = e?.data?.details?.traceId; state = { ...sample, - traceId, + traceIds: traceId ? [traceId] : [], evalError: `Error when running inference. Details: ${e?.message ?? e}`, }; } @@ -341,14 +357,90 @@ async function runModelAction(params: { }); state = { ...sample, + traceIds: runActionResponse.telemetry?.traceId + ? 
[runActionResponse.telemetry?.traceId] + : [], + response: runActionResponse.result, + }; + } catch (e: any) { + const traceId = e?.data?.details?.traceId; + state = { + ...sample, + traceIds: traceId ? [traceId] : [], + evalError: `Error when running inference. Details: ${e?.message ?? e}`, + }; + } + return state; +} + +async function runPromptAction(params: { + manager: RuntimeManager; + actionRef: string; + sample: FullInferenceSample; + context?: any; + modelConfig?: any; +}): Promise { + const { manager, actionRef, sample, context, modelConfig } = { ...params }; + + const { model: modelFromConfig, ...restOfConfig } = modelConfig ?? {}; + const model = await resolveModel({ manager, actionRef, modelFromConfig }); + if (!model) { + throw new Error( + 'Could not resolve model. Please provide model in the prompt file or through the `actionConfig.model` field.' + ); + } + let state: InferenceRunState; + let renderedPrompt: { + result: GenerateActionOptions; + traceId?: string; + }; + // Step 1. Render the prompt with inputs + try { + const runActionResponse = await manager.runAction({ + key: actionRef, + input: sample.input, + context: context ? JSON.parse(context) : undefined, + }); + + renderedPrompt = { traceId: runActionResponse.telemetry?.traceId, + result: runActionResponse.result as GenerateActionOptions, + }; + } catch (e: any) { + const traceId = e?.data?.details?.traceId; + state = { + ...sample, + traceIds: traceId ? [traceId] : [], + evalError: `Error when rendering prompt. Details: ${e?.message ?? e}`, + }; + return state; + } + // Step 2. Run rendered prompt on the model + try { + let modelInput = GenerateActionOptionsSchema.parse(renderedPrompt.result); + if (restOfConfig) { + modelInput = { ...modelInput, config: restOfConfig }; + } + const runActionResponse = await manager.runAction({ + key: model, + input: modelInput, + }); + const traceIds = runActionResponse.telemetry?.traceId + ? [renderedPrompt.traceId, runActionResponse.telemetry?.traceId] + : [renderedPrompt.traceId]; + state = { + ...sample, + traceIds: traceIds.filter((t): t is string => !!t), response: runActionResponse.result, }; } catch (e: any) { const traceId = e?.data?.details?.traceId; + const traceIds = traceId + ? [renderedPrompt.traceId, traceId] + : [renderedPrompt.traceId]; state = { ...sample, - traceId, + traceIds: traceIds.filter((t): t is string => !!t), evalError: `Error when running inference. Details: ${e?.message ?? e}`, }; } @@ -362,25 +454,31 @@ async function gatherEvalInput(params: { }): Promise { const { manager, actionRef, state } = params; + const actionType = getSupportedActionType(actionRef); const extractors = await getEvalExtractors(actionRef); - const traceId = state.traceId; - if (!traceId) { - logger.warn('No traceId available...'); + const traceIds = state.traceIds; + + if ( + traceIds.length === 0 || + (actionType === 'executable-prompt' && traceIds.length < 2) + ) { + logger.warn('No valid traceId available...'); return { ...state, error: state.evalError, testCaseId: state.testCaseId, - traceIds: [], + traceIds: traceIds, }; } + // Only the last collected trace to be used for evaluation. + const traceId = traceIds.at(-1)!; const trace = await manager.getTrace({ traceId, }); - const isModelAction = actionRef.startsWith('/model'); // Always use original input for models. - const input = isModelAction ? state.input : extractors.input(trace); + const input = actionType === 'model' ? 
state.input : extractors.input(trace); const nestedSpan = stackTraceSpans(trace); if (!nestedSpan) { @@ -389,7 +487,7 @@ async function gatherEvalInput(params: { input, error: `Unable to extract any spans from trace ${traceId}`, reference: state.reference, - traceIds: [traceId], + traceIds: traceIds, }; } @@ -400,13 +498,14 @@ async function gatherEvalInput(params: { error: getSpanErrorMessage(nestedSpan) ?? `Unknown error in trace ${traceId}`, reference: state.reference, - traceIds: [traceId], + traceIds: traceIds, }; } const output = extractors.output(trace); const context = extractors.context(trace); - const error = isModelAction ? getErrorFromModelResponse(output) : undefined; + const error = + actionType === 'model' ? getErrorFromModelResponse(output) : undefined; return { // TODO Replace this with unified trace class @@ -416,10 +515,27 @@ async function gatherEvalInput(params: { error, context: Array.isArray(context) ? context : [context], reference: state.reference, - traceIds: [traceId], + traceIds: traceIds, }; } +async function resolveModel(params: { + manager: RuntimeManager; + actionRef: string; + modelFromConfig?: string; +}) { + const { manager, actionRef, modelFromConfig } = { ...params }; + + const actionData = await getAction({ manager, actionRef }); + // Prefer to use modelFromConfig + if (modelFromConfig) { + return modelFromConfig; + } + + const promptMetadata = actionData?.metadata?.prompt as any; + return promptMetadata?.model ? `/model/${promptMetadata?.model}` : undefined; +} + function getSpanErrorMessage(span: SpanData): string | undefined { if (span && span.status?.code === 2 /* SpanStatusCode.ERROR */) { // It's possible for a trace to have multiple exception events, @@ -466,3 +582,16 @@ function isSupportedActionRef(actionRef: string) { actionRef.startsWith(`/${supportedType}`) ); } + +function getSupportedActionType(actionRef: string): SupportedActionType { + if (actionRef.startsWith('/model')) { + return 'model'; + } + if (actionRef.startsWith('/flow')) { + return 'flow'; + } + if (actionRef.startsWith('/executable-prompt')) { + return 'executable-prompt'; + } + throw new Error(`Unsupported action type: ${actionRef}`); +} diff --git a/genkit-tools/common/src/types/eval.ts b/genkit-tools/common/src/types/eval.ts index ac6eddb9dd..9cbdd73f47 100644 --- a/genkit-tools/common/src/types/eval.ts +++ b/genkit-tools/common/src/types/eval.ts @@ -251,7 +251,12 @@ export const DatasetSchemaSchema = z.object({ }); /** Type of dataset, useful for UI niceties. 
*/ -export const DatasetTypeSchema = z.enum(['UNKNOWN', 'FLOW', 'MODEL']); +export const DatasetTypeSchema = z.enum([ + 'UNKNOWN', + 'FLOW', + 'MODEL', + 'EXECUTABLE_PROMPT', +]); export type DatasetType = z.infer; /** diff --git a/genkit-tools/common/src/utils/eval.ts b/genkit-tools/common/src/utils/eval.ts index 765075b818..81243f6c00 100644 --- a/genkit-tools/common/src/utils/eval.ts +++ b/genkit-tools/common/src/utils/eval.ts @@ -336,6 +336,16 @@ export async function hasAction(params: { return actionsRecord.hasOwnProperty(actionRef); } +export async function getAction(params: { + manager: RuntimeManager; + actionRef: string; +}): Promise { + const { manager, actionRef } = { ...params }; + const allActions = await manager.listActions(); + + return Object.values(allActions).find((action) => action.key === actionRef); +} + /** Helper function that maps string data to GenerateRequest */ export function getModelInput(data: any, modelConfig: any): GenerateRequest { let message: MessageData; @@ -355,7 +365,7 @@ export function getModelInput(data: any, modelConfig: any): GenerateRequest { } else { const maybeRequest = GenerateRequestSchema.safeParse(data); if (maybeRequest.success) { - return maybeRequest.data; + return { ...maybeRequest.data, config: modelConfig }; } else { throw new Error( `Unable to parse model input as MessageSchema. Details: ${maybeRequest.error}` diff --git a/js/testapps/evals/prompts/hello.prompt b/js/testapps/evals/prompts/hello.prompt new file mode 100644 index 0000000000..271e898f5d --- /dev/null +++ b/js/testapps/evals/prompts/hello.prompt @@ -0,0 +1,12 @@ +--- +model: googleai/gemini-2.5-flash +config: + temperature: 0.75 +input: + schema: + firstName: string + lastName: string + persona: string +--- + +You are a {{persona}}. Say hello to {{firstName}} {{lastName}}. diff --git a/js/testapps/evals/src/genkit.ts b/js/testapps/evals/src/genkit.ts index f03cf816da..e59f0f13b6 100644 --- a/js/testapps/evals/src/genkit.ts +++ b/js/testapps/evals/src/genkit.ts @@ -22,11 +22,6 @@ import { googleAI, textEmbeddingGecko001, } from '@genkit-ai/googleai'; -import { vertexAI } from '@genkit-ai/vertexai'; -import { - VertexAIEvaluationMetricType, - vertexAIEvaluation, -} from '@genkit-ai/vertexai/evaluation'; import { genkit } from 'genkit'; import { langchain } from 'genkitx-langchain'; @@ -70,25 +65,6 @@ export const ai = genkit({ }, ], }), - vertexAI({ - location: 'us-central1', - }), - vertexAIEvaluation({ - location: 'us-central1', - metrics: [ - VertexAIEvaluationMetricType.BLEU, - VertexAIEvaluationMetricType.GROUNDEDNESS, - VertexAIEvaluationMetricType.SAFETY, - { - type: VertexAIEvaluationMetricType.ROUGE, - metricSpec: { - rougeType: 'rougeLsum', - useStemmer: true, - splitSummaries: 'true', - }, - }, - ], - }), devLocalVectorstore([ { indexName: 'pdfQA', From 79caa7e04d28f9ffa2b9a02aeee884a08a743b9d Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Mon, 28 Jul 2025 14:10:50 -0400 Subject: [PATCH 2/4] fix to use original input --- genkit-tools/common/src/eval/evaluate.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 4cc022f3d9..451f7eb85a 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -477,8 +477,8 @@ async function gatherEvalInput(params: { traceId, }); - // Always use original input for models. - const input = actionType === 'model' ? 
state.input : extractors.input(trace); + // Always use original input for models and prompts. + const input = actionType === 'flow' ? extractors.input(trace) : state.input; const nestedSpan = stackTraceSpans(trace); if (!nestedSpan) { From 71039cdd55ee825eaa199e5a01713c838f76e457 Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Fri, 15 Aug 2025 10:23:35 -0400 Subject: [PATCH 3/4] updated to have custom fields --- genkit-tools/common/src/eval/evaluate.ts | 10 ++++++++++ genkit-tools/common/src/types/eval.ts | 1 + js/ai/src/evaluator.ts | 1 + js/testapps/evals/prompts/hello.prompt | 10 ++++++---- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 451f7eb85a..7f68d76930 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -479,6 +479,13 @@ async function gatherEvalInput(params: { // Always use original input for models and prompts. const input = actionType === 'flow' ? extractors.input(trace) : state.input; + let custom = undefined; + if (actionType === 'executable-prompt') { + const promptTrace = await manager.getTrace({ + traceId: traceIds[0], + }); + custom = { renderedPrompt: extractors.output(promptTrace) }; + } const nestedSpan = stackTraceSpans(trace); if (!nestedSpan) { @@ -487,6 +494,7 @@ async function gatherEvalInput(params: { input, error: `Unable to extract any spans from trace ${traceId}`, reference: state.reference, + custom, traceIds: traceIds, }; } @@ -498,6 +506,7 @@ async function gatherEvalInput(params: { error: getSpanErrorMessage(nestedSpan) ?? `Unknown error in trace ${traceId}`, reference: state.reference, + custom, traceIds: traceIds, }; } @@ -515,6 +524,7 @@ async function gatherEvalInput(params: { error, context: Array.isArray(context) ? context : [context], reference: state.reference, + custom, traceIds: traceIds, }; } diff --git a/genkit-tools/common/src/types/eval.ts b/genkit-tools/common/src/types/eval.ts index 9cbdd73f47..56590889ae 100644 --- a/genkit-tools/common/src/types/eval.ts +++ b/genkit-tools/common/src/types/eval.ts @@ -134,6 +134,7 @@ export const EvalInputSchema = z.object({ error: z.string().optional(), context: z.array(z.any()).optional(), reference: z.any().optional(), + custom: z.record(z.string(), z.any()).optional(), traceIds: z.array(z.string()), }); export type EvalInput = z.infer; diff --git a/js/ai/src/evaluator.ts b/js/ai/src/evaluator.ts index 98db367fb4..c65836f0f5 100644 --- a/js/ai/src/evaluator.ts +++ b/js/ai/src/evaluator.ts @@ -29,6 +29,7 @@ export const BaseDataPointSchema = z.object({ output: z.unknown().optional(), context: z.array(z.unknown()).optional(), reference: z.unknown().optional(), + custom: z.record(z.string(), z.unknown()).optional(), testCaseId: z.string().optional(), traceIds: z.array(z.string()).optional(), }); diff --git a/js/testapps/evals/prompts/hello.prompt b/js/testapps/evals/prompts/hello.prompt index 271e898f5d..0ef26c9c6d 100644 --- a/js/testapps/evals/prompts/hello.prompt +++ b/js/testapps/evals/prompts/hello.prompt @@ -4,9 +4,11 @@ config: temperature: 0.75 input: schema: - firstName: string - lastName: string - persona: string + query: string --- -You are a {{persona}}. Say hello to {{firstName}} {{lastName}}. 
+{{role "system"}} +Only write code, do not explain + +{{role "user"}} +Assist the user with: {{query}} \ No newline at end of file From b8ac7b205681248fa800c2c9e8d786e7715cbd90 Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Tue, 19 Aug 2025 16:49:45 -0400 Subject: [PATCH 4/4] small fixes --- genkit-tools/common/src/eval/evaluate.ts | 47 ++++++++++++++---------- js/ai/src/evaluator.ts | 1 - 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 7f68d76930..ddf99e3361 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -15,6 +15,7 @@ */ import { randomUUID } from 'crypto'; +import { z } from 'zod'; import { getDatasetStore, getEvalStore } from '.'; import type { RuntimeManager } from '../manager/manager'; import { @@ -279,8 +280,7 @@ async function bulkRunAction(params: { modelConfig: actionConfig, }) ); - } - if (actionType === 'flow') { + } else if (actionType === 'flow') { states.push( await runFlowAction({ manager, @@ -290,13 +290,14 @@ async function bulkRunAction(params: { }) ); } else { + // executable-prompt action states.push( await runPromptAction({ manager, actionRef, sample, context, - modelConfig: actionConfig, + promptConfig: actionConfig, }) ); } @@ -378,21 +379,21 @@ async function runPromptAction(params: { actionRef: string; sample: FullInferenceSample; context?: any; - modelConfig?: any; + promptConfig?: any; }): Promise { - const { manager, actionRef, sample, context, modelConfig } = { ...params }; + const { manager, actionRef, sample, context, promptConfig } = { ...params }; - const { model: modelFromConfig, ...restOfConfig } = modelConfig ?? {}; + const { model: modelFromConfig, ...restOfConfig } = promptConfig ?? {}; const model = await resolveModel({ manager, actionRef, modelFromConfig }); if (!model) { throw new Error( - 'Could not resolve model. Please provide model in the prompt file or through the `actionConfig.model` field.' + 'Could not resolve model. Please specify model and try again' ); } let state: InferenceRunState; let renderedPrompt: { result: GenerateActionOptions; - traceId?: string; + traceId: string; }; // Step 1. Render the prompt with inputs try { @@ -403,21 +404,29 @@ async function runPromptAction(params: { }); renderedPrompt = { - traceId: runActionResponse.telemetry?.traceId, - result: runActionResponse.result as GenerateActionOptions, + traceId: runActionResponse.telemetry?.traceId!, + result: GenerateActionOptionsSchema.parse(runActionResponse.result), }; } catch (e: any) { - const traceId = e?.data?.details?.traceId; - state = { - ...sample, - traceIds: traceId ? [traceId] : [], - evalError: `Error when rendering prompt. Details: ${e?.message ?? e}`, - }; + if (e instanceof z.ZodError) { + state = { + ...sample, + traceIds: [], + evalError: `Error parsing prompt response. Details: ${JSON.stringify(e.format())}`, + }; + } else { + const traceId = e?.data?.details?.traceId; + state = { + ...sample, + traceIds: traceId ? [traceId] : [], + evalError: `Error when rendering prompt. Details: ${e?.message ?? e}`, + }; + } return state; } // Step 2. 
Run rendered prompt on the model try { - let modelInput = GenerateActionOptionsSchema.parse(renderedPrompt.result); + let modelInput = renderedPrompt.result; if (restOfConfig) { modelInput = { ...modelInput, config: restOfConfig }; } @@ -430,7 +439,7 @@ async function runPromptAction(params: { : [renderedPrompt.traceId]; state = { ...sample, - traceIds: traceIds.filter((t): t is string => !!t), + traceIds: traceIds, response: runActionResponse.result, }; } catch (e: any) { @@ -536,12 +545,12 @@ async function resolveModel(params: { }) { const { manager, actionRef, modelFromConfig } = { ...params }; - const actionData = await getAction({ manager, actionRef }); // Prefer to use modelFromConfig if (modelFromConfig) { return modelFromConfig; } + const actionData = await getAction({ manager, actionRef }); const promptMetadata = actionData?.metadata?.prompt as any; return promptMetadata?.model ? `/model/${promptMetadata?.model}` : undefined; } diff --git a/js/ai/src/evaluator.ts b/js/ai/src/evaluator.ts index c2c8bc2b93..1c2146ea20 100644 --- a/js/ai/src/evaluator.ts +++ b/js/ai/src/evaluator.ts @@ -29,7 +29,6 @@ export const BaseDataPointSchema = z.object({ output: z.unknown().optional(), context: z.array(z.unknown()).optional(), reference: z.unknown().optional(), - custom: z.record(z.string(), z.unknown()).optional(), testCaseId: z.string().optional(), traceIds: z.array(z.string()).optional(), });
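
For reference, the executable-prompt support this series adds boils down to a
two-step inference: render the .prompt file into a GenerateActionOptions
request, then run that request on the resolved model. Below is a minimal
sketch of that flow, illustrative only: the action and model keys are
examples, the import path for the schema is assumed to mirror evaluate.ts,
and the error handling and trace-id bookkeeping of the real runPromptAction
are elided.

import type { RuntimeManager } from '../manager/manager';
// Path assumed; evaluate.ts imports GenerateActionOptionsSchema alongside
// DatasetSchema and friends.
import { GenerateActionOptionsSchema } from '../types';

async function renderAndRun(manager: RuntimeManager, input: unknown) {
  // Step 1: render the prompt into a GenerateActionOptions request.
  const rendered = await manager.runAction({
    key: '/executable-prompt/hello', // e.g. prompts/hello.prompt above
    input,
  });
  const modelInput = GenerateActionOptionsSchema.parse(rendered.result);

  // Step 2: run the rendered request on the model named in the prompt's
  // front matter (resolveModel maps it to a '/model/...' action key).
  const response = await manager.runAction({
    key: '/model/googleai/gemini-2.5-flash',
    input: modelInput,
  });
  return response.result;
}

In the real flow, both trace ids (prompt render plus model call) are recorded
in traceIds; gatherEvalInput evaluates against the last one and, for
executable prompts, surfaces the rendered prompt under custom.renderedPrompt.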