Skip to content

Commit 6abae3c

Browse files
Introduce Context property on EvaluationMetric (#6316)
This allows evaluators to record the contextual information that was used in an evaluation, so that it can be displayed in the evaluation report.

This PR also updates all evaluators that rely on contextual information and that ship as part of the Quality and Safety packages (i.e., `GroundednessEvaluator`, `EquivalenceEvaluator`, `GroundednessProEvaluator` and `UngroundedAttributesEvaluator`) to include that contextual information in the `EvaluationMetric`s they produce.

Also includes some cleanup of extension methods as part of public API stabilization:
- Removes the extension methods for adding a single diagnostic in favor of overloads that take a `params` array, which can be called in the same way.

Fixes #6033
1 parent 59f629f commit 6abae3c

File tree

20 files changed

+323
-116
lines changed

20 files changed

+323
-116
lines changed

src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ private static async Task<int> Main(string[] args)
9090

9191
var formatOpt =
9292
new Option<ReportCommand.Format>(
93-
"--format",
93+
["-f", "--format"],
9494
() => ReportCommand.Format.html,
9595
"Specify the format for the generated report.");
9696

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4-
using System;
54
using System.Collections.Generic;
65
using System.Linq;
76
using System.Text;
@@ -35,7 +34,7 @@ public abstract class ChatConversationEvaluator : IEvaluator
3534
protected virtual string? SystemPrompt => null;
3635

3736
/// <inheritdoc/>
38-
public async ValueTask<EvaluationResult> EvaluateAsync(
37+
public virtual async ValueTask<EvaluationResult> EvaluateAsync(
3938
IEnumerable<ChatMessage> messages,
4039
ChatResponse modelResponse,
4140
ChatConfiguration? chatConfiguration = null,
@@ -49,7 +48,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
4948

5049
if (string.IsNullOrWhiteSpace(modelResponse.Text))
5150
{
52-
result.AddDiagnosticToAllMetrics(
51+
result.AddDiagnosticsToAllMetrics(
5352
EvaluationDiagnostic.Error(
5453
"Evaluation failed because the model response supplied for evaluation was null or empty."));
5554

@@ -73,7 +72,7 @@ void OnTokenBudgetExceeded()
7372
EvaluationDiagnostic.Error(
7473
$"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");
7574

76-
result.AddDiagnosticToAllMetrics(tokenBudgetExceeded);
75+
result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
7776
}
7877

7978
if (!string.IsNullOrWhiteSpace(SystemPrompt))
@@ -176,7 +175,7 @@ await PerformEvaluationAsync(
176175
if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
177176
{
178177
#pragma warning disable S103 // Lines should not be too long
179-
result.AddDiagnosticToAllMetrics(
178+
result.AddDiagnosticsToAllMetrics(
180179
EvaluationDiagnostic.Warning(
181180
$"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
182181
#pragma warning restore S103

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,28 @@ public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator
4949
/// <inheritdoc/>
5050
protected override bool IgnoresHistory => true;
5151

52+
/// <inheritdoc/>
53+
public override async ValueTask<EvaluationResult> EvaluateAsync(
54+
IEnumerable<ChatMessage> messages,
55+
ChatResponse modelResponse,
56+
ChatConfiguration? chatConfiguration = null,
57+
IEnumerable<EvaluationContext>? additionalContext = null,
58+
CancellationToken cancellationToken = default)
59+
{
60+
EvaluationResult result =
61+
await base.EvaluateAsync(
62+
messages,
63+
modelResponse,
64+
chatConfiguration,
65+
additionalContext,
66+
cancellationToken).ConfigureAwait(false);
67+
68+
EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext);
69+
result.AddOrUpdateContextInAllMetrics("Ground Truth", context.GetContents());
70+
71+
return result;
72+
}
73+
5274
/// <inheritdoc/>
5375
protected override async ValueTask<string> RenderEvaluationPromptAsync(
5476
ChatMessage? userRequest,
@@ -66,18 +88,8 @@ userRequest is not null
6688
? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
6789
: string.Empty;
6890

69-
string groundTruth;
70-
71-
if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
72-
is EquivalenceEvaluatorContext context)
73-
{
74-
groundTruth = context.GroundTruth;
75-
}
76-
else
77-
{
78-
throw new InvalidOperationException(
79-
$"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
80-
}
91+
EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext);
92+
string groundTruth = context.GroundTruth;
8193

8294
string prompt =
8395
$$"""
@@ -149,4 +161,16 @@ alleviating stress and augmenting general mood.
149161

150162
return prompt;
151163
}
164+
165+
private static EquivalenceEvaluatorContext GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
166+
{
167+
if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
168+
is EquivalenceEvaluatorContext context)
169+
{
170+
return context;
171+
}
172+
173+
throw new InvalidOperationException(
174+
$"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
175+
}
152176
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,30 @@ public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator
4949
/// <inheritdoc/>
5050
protected override bool IgnoresHistory => false;
5151

52+
/// <inheritdoc/>
53+
public override async ValueTask<EvaluationResult> EvaluateAsync(
54+
IEnumerable<ChatMessage> messages,
55+
ChatResponse modelResponse,
56+
ChatConfiguration? chatConfiguration = null,
57+
IEnumerable<EvaluationContext>? additionalContext = null,
58+
CancellationToken cancellationToken = default)
59+
{
60+
EvaluationResult result =
61+
await base.EvaluateAsync(
62+
messages,
63+
modelResponse,
64+
chatConfiguration,
65+
additionalContext,
66+
cancellationToken).ConfigureAwait(false);
67+
68+
if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context)
69+
{
70+
result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents());
71+
}
72+
73+
return result;
74+
}
75+
5276
/// <inheritdoc/>
5377
protected override async ValueTask<string> RenderEvaluationPromptAsync(
5478
ChatMessage? userRequest,
@@ -68,8 +92,7 @@ userRequest is not null
6892

6993
var builder = new StringBuilder();
7094

71-
if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
72-
is GroundednessEvaluatorContext context)
95+
if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context)
7396
{
7497
_ = builder.Append(context.GroundingContext);
7598
_ = builder.AppendLine();
@@ -162,4 +185,15 @@ is not French.
162185

163186
return prompt;
164187
}
188+
189+
private static GroundednessEvaluatorContext? GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
190+
{
191+
if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
192+
is GroundednessEvaluatorContext context)
193+
{
194+
return context;
195+
}
196+
197+
return null;
198+
}
165199
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
145145
if (string.IsNullOrEmpty(evaluationResponseText))
146146
{
147147
rating = Rating.Inconclusive;
148-
result.AddDiagnosticToAllMetrics(
148+
result.AddDiagnosticsToAllMetrics(
149149
EvaluationDiagnostic.Error(
150150
"Evaluation failed because the model failed to produce a valid evaluation response."));
151151
}
@@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync(
168168
if (string.IsNullOrEmpty(repairedJson))
169169
{
170170
rating = Rating.Inconclusive;
171-
result.AddDiagnosticToAllMetrics(
171+
result.AddDiagnosticsToAllMetrics(
172172
EvaluationDiagnostic.Error(
173173
$"""
174174
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync(
183183
catch (JsonException ex)
184184
{
185185
rating = Rating.Inconclusive;
186-
result.AddDiagnosticToAllMetrics(
186+
result.AddDiagnosticsToAllMetrics(
187187
EvaluationDiagnostic.Error(
188188
$"""
189189
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -281,7 +281,7 @@ void UpdateResult()
281281

282282
if (!string.IsNullOrWhiteSpace(rating.Error))
283283
{
284-
result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
284+
result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
285285
}
286286
}
287287
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
105105

106106
if (string.IsNullOrEmpty(evaluationResponseText))
107107
{
108-
metric.AddDiagnostic(
108+
metric.AddDiagnostics(
109109
EvaluationDiagnostic.Error(
110110
"Evaluation failed because the model failed to produce a valid evaluation response."));
111111
}
@@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
115115
}
116116
else
117117
{
118-
metric.AddDiagnostic(
118+
metric.AddDiagnostics(
119119
EvaluationDiagnostic.Error(
120120
$"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
121121
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ import ReactMarkdown from "react-markdown";
88
import { useReportContext } from "./ReportContext";
99
import { useStyles } from "./Styles";
1010
import { ChatMessageDisplay, isTextContent, isImageContent } from "./Summary";
11+
import type { MetricType } from "./MetricCard";
1112

12-
export const ConversationDetails = ({ messages, model, usage }: {
13+
export const ConversationDetails = ({ messages, model, usage, selectedMetric }: {
1314
messages: ChatMessageDisplay[];
1415
model?: string;
1516
usage?: UsageDetails;
17+
selectedMetric?: MetricType | null;
1618
}) => {
1719
const classes = useStyles();
1820
const [isExpanded, setIsExpanded] = useState(true);
@@ -59,7 +61,27 @@ export const ConversationDetails = ({ messages, model, usage }: {
5961
return result;
6062
};
6163

64+
const getContextGroups = () => {
65+
if (!selectedMetric || !selectedMetric.context) {
66+
return [];
67+
}
68+
69+
const contextGroups: { key: string, contents: AIContent[] }[] = [];
70+
71+
for (const [key, contents] of Object.entries(selectedMetric.context)) {
72+
if (contents && contents.length > 0) {
73+
contextGroups.push({
74+
key: key.toLowerCase(),
75+
contents: contents
76+
});
77+
}
78+
}
79+
80+
return contextGroups;
81+
};
82+
6283
const messageGroups = groupMessages();
84+
const contextGroups = getContextGroups();
6385

6486
return (
6587
<div className={classes.section}>
@@ -79,7 +101,7 @@ export const ConversationDetails = ({ messages, model, usage }: {
79101
);
80102

81103
return (
82-
<div key={index} className={messageRowClass}>
104+
<div key={`msg-${index}`} className={messageRowClass}>
83105
<div className={classes.messageParticipantName}>{group.participantName}</div>
84106
<div className={classes.messageBubble}>
85107
{group.contents.map((content, contentIndex) => (
@@ -91,6 +113,19 @@ export const ConversationDetails = ({ messages, model, usage }: {
91113
</div>
92114
);
93115
})}
116+
117+
{contextGroups.map((group, index) => (
118+
<div key={`context-${index}`} className={mergeClasses(classes.messageRow, classes.userMessageRow)}>
119+
<div className={classes.messageParticipantName}>{`supplied evaluation context (${group.key})`}</div>
120+
<div className={classes.contextBubble}>
121+
{group.contents.map((content, contentIndex) => (
122+
<div key={contentIndex}>
123+
{renderContent(content)}
124+
</div>
125+
))}
126+
</div>
127+
</div>
128+
))}
94129
</div>
95130
)}
96131
</div>

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ type BaseEvaluationMetric = {
9494
$type: string;
9595
name: string;
9696
interpretation?: EvaluationMetricInterpretation;
97+
context?: {
98+
[K: string]: AIContent[]
99+
};
97100
diagnostics?: EvaluationDiagnostic[];
98101
metadata: {
99102
[K: string]: string

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ export const ScoreDetail = ({ scenario, scoreSummary }: { scenario: ScenarioRunR
3232
onMetricSelect={setSelectedMetric}
3333
selectedMetric={selectedMetric} />
3434
{selectedMetric && <MetricDetailsSection metric={selectedMetric} />}
35-
<ConversationDetails messages={messages} model={model} usage={usage} />
35+
<ConversationDetails messages={messages} model={model} usage={usage} selectedMetric={selectedMetric} />
3636
{scenario.chatDetails && scenario.chatDetails.turnDetails.length > 0 && <ChatDetailsSection chatDetails={scenario.chatDetails} />}
3737
</div>);
3838
};

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ export const useStyles = makeStyles({
127127
backgroundColor: tokens.colorNeutralBackground3,
128128
border: '1px solid ' + tokens.colorNeutralStroke2,
129129
},
130+
contextBubble: {
131+
padding: '0.75rem 1rem',
132+
borderRadius: '12px',
133+
overflow: 'hidden',
134+
wordBreak: 'break-word',
135+
backgroundColor: tokens.colorBrandBackground2,
136+
border: '1px solid ' + tokens.colorNeutralStroke2,
137+
},
130138
cacheHitIcon: {
131139
color: tokens.colorPaletteGreenForeground1,
132140
},

0 commit comments

Comments
 (0)