dotnet
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs‎
Lines changed: 1 addition & 1 deletion b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs‎
Lines changed: 4 additions & 5 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs‎
Lines changed: 36 additions & 12 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs‎
Lines changed: 36 additions & 12 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs‎
Lines changed: 36 additions & 2 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs‎
Lines changed: 36 additions & 2 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs‎
Lines changed: 4 additions & 4 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs‎
Lines changed: 2 additions & 2 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx‎
Lines changed: 37 additions & 2 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx‎
Lines changed: 37 additions & 2 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts‎
Lines changed: 3 additions & 0 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx‎
Lines changed: 1 addition & 1 deletion b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts‎
Lines changed: 8 additions & 0 deletions b/‎src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts‎
Lines changed: 8 additions & 0 deletions
@@ -90,7 +90,7 @@ private static async Task<int> Main(string[] args)
 
         var formatOpt =
             new Option<ReportCommand.Format>(
-                "--format",
+                ["-f", "--format"],
                 () => ReportCommand.Format.html,
                 "Specify the format for the generated report.");
 
 
@@ -1,7 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
@@ -35,7 +34,7 @@ public abstract class ChatConversationEvaluator : IEvaluator
     protected virtual string? SystemPrompt => null;
 
     /// <inheritdoc/>
-    public async ValueTask<EvaluationResult> EvaluateAsync(
+    public virtual async ValueTask<EvaluationResult> EvaluateAsync(
         IEnumerable<ChatMessage> messages,
         ChatResponse modelResponse,
         ChatConfiguration? chatConfiguration = null,
@@ -49,7 +48,7 @@ public async ValueTask<EvaluationResult> EvaluateAsync(
 
         if (string.IsNullOrWhiteSpace(modelResponse.Text))
         {
-            result.AddDiagnosticToAllMetrics(
+            result.AddDiagnosticsToAllMetrics(
                 EvaluationDiagnostic.Error(
                     "Evaluation failed because the model response supplied for evaluation was null or empty."));
 
@@ -73,7 +72,7 @@ void OnTokenBudgetExceeded()
                     EvaluationDiagnostic.Error(
                         $"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");
 
-                result.AddDiagnosticToAllMetrics(tokenBudgetExceeded);
+                result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
             }
 
             if (!string.IsNullOrWhiteSpace(SystemPrompt))
@@ -176,7 +175,7 @@ await PerformEvaluationAsync(
         if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
         {
 #pragma warning disable S103 // Lines should not be too long
-            result.AddDiagnosticToAllMetrics(
+            result.AddDiagnosticsToAllMetrics(
                 EvaluationDiagnostic.Warning(
                     $"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
 #pragma warning restore S103
 
@@ -49,6 +49,28 @@ public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator
     /// <inheritdoc/>
     protected override bool IgnoresHistory => true;
 
+    /// <inheritdoc/>
+    public override async ValueTask<EvaluationResult> EvaluateAsync(
+        IEnumerable<ChatMessage> messages,
+        ChatResponse modelResponse,
+        ChatConfiguration? chatConfiguration = null,
+        IEnumerable<EvaluationContext>? additionalContext = null,
+        CancellationToken cancellationToken = default)
+    {
+        // Run the inherited evaluation first; the ground truth is attached to its result afterwards.
+        ValueTask<EvaluationResult> pendingEvaluation =
+            base.EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken);
+
+        EvaluationResult evaluation = await pendingEvaluation.ConfigureAwait(false);
+
+        // GetRelevantContext throws InvalidOperationException when no EquivalenceEvaluatorContext
+        // is present in additionalContext.
+        var groundTruthContents = GetRelevantContext(additionalContext).GetContents();
+        evaluation.AddOrUpdateContextInAllMetrics("Ground Truth", groundTruthContents);
+
+        return evaluation;
+    }
+
     /// <inheritdoc/>
     protected override async ValueTask<string> RenderEvaluationPromptAsync(
         ChatMessage? userRequest,
@@ -66,18 +88,8 @@ userRequest is not null
                 ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
                 : string.Empty;
 
-        string groundTruth;
-
-        if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
-                is EquivalenceEvaluatorContext context)
-        {
-            groundTruth = context.GroundTruth;
-        }
-        else
-        {
-            throw new InvalidOperationException(
-                $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
-        }
+        EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext);
+        string groundTruth = context.GroundTruth;
 
         string prompt =
             $$"""
@@ -149,4 +161,16 @@ alleviating stress and augmenting general mood.
 
         return prompt;
     }
+
+    // Returns the first EquivalenceEvaluatorContext found in additionalContext.
+    // Throws InvalidOperationException when additionalContext is null or holds no such entry.
+    private static EquivalenceEvaluatorContext GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
+    {
+        EquivalenceEvaluatorContext? match =
+            additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault();
+
+        return match
+            ?? throw new InvalidOperationException(
+                $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
+    }
 }
@@ -49,6 +49,30 @@ public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator
     /// <inheritdoc/>
     protected override bool IgnoresHistory => false;
 
+    /// <inheritdoc/>
+    public override async ValueTask<EvaluationResult> EvaluateAsync(
+        IEnumerable<ChatMessage> messages,
+        ChatResponse modelResponse,
+        ChatConfiguration? chatConfiguration = null,
+        IEnumerable<EvaluationContext>? additionalContext = null,
+        CancellationToken cancellationToken = default)
+    {
+        // Run the inherited evaluation first; any grounding context is attached to its result afterwards.
+        ValueTask<EvaluationResult> pendingEvaluation =
+            base.EvaluateAsync(messages, modelResponse, chatConfiguration, additionalContext, cancellationToken);
+        EvaluationResult evaluation = await pendingEvaluation.ConfigureAwait(false);
+
+        // The grounding context is optional: without one the result is returned as produced by the base class.
+        GroundednessEvaluatorContext? groundingContext = GetRelevantContext(additionalContext);
+        if (groundingContext is null)
+        {
+            return evaluation;
+        }
+
+        evaluation.AddOrUpdateContextInAllMetrics("Grounding Context", groundingContext.GetContents());
+        return evaluation;
+    }
+
     /// <inheritdoc/>
     protected override async ValueTask<string> RenderEvaluationPromptAsync(
         ChatMessage? userRequest,
@@ -68,8 +92,7 @@ userRequest is not null
 
         var builder = new StringBuilder();
 
-        if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
-                is GroundednessEvaluatorContext context)
+        if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context)
         {
             _ = builder.Append(context.GroundingContext);
             _ = builder.AppendLine();
@@ -162,4 +185,15 @@ is not French.
 
         return prompt;
     }
+
+    // Looks up the grounding context among the supplied evaluation contexts.
+    // Returns the first GroundednessEvaluatorContext found, or null when additionalContext
+    // is null or holds no such entry.
+    private static GroundednessEvaluatorContext? GetRelevantContext(IEnumerable<EvaluationContext>? additionalContext)
+    {
+        GroundednessEvaluatorContext? firstMatch =
+            additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault();
+
+        return firstMatch;
+    }
 }
@@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
             if (string.IsNullOrEmpty(evaluationResponseText))
             {
                 rating = Rating.Inconclusive;
-                result.AddDiagnosticToAllMetrics(
+                result.AddDiagnosticsToAllMetrics(
                     EvaluationDiagnostic.Error(
                         "Evaluation failed because the model failed to produce a valid evaluation response."));
             }
@@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync(
                         if (string.IsNullOrEmpty(repairedJson))
                         {
                             rating = Rating.Inconclusive;
-                            result.AddDiagnosticToAllMetrics(
+                            result.AddDiagnosticsToAllMetrics(
                                 EvaluationDiagnostic.Error(
                                     $"""
                                     Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync(
                     catch (JsonException ex)
                     {
                         rating = Rating.Inconclusive;
-                        result.AddDiagnosticToAllMetrics(
+                        result.AddDiagnosticsToAllMetrics(
                             EvaluationDiagnostic.Error(
                                 $"""
                                 Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
@@ -281,7 +281,7 @@ void UpdateResult()
 
             if (!string.IsNullOrWhiteSpace(rating.Error))
             {
-                result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
+                result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));
             }
         }
     }
 
@@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
 
             if (string.IsNullOrEmpty(evaluationResponseText))
             {
-                metric.AddDiagnostic(
+                metric.AddDiagnostics(
                     EvaluationDiagnostic.Error(
                         "Evaluation failed because the model failed to produce a valid evaluation response."));
             }
@@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(
             }
             else
             {
-                metric.AddDiagnostic(
+                metric.AddDiagnostics(
                     EvaluationDiagnostic.Error(
                         $"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
             }
 
@@ -8,11 +8,13 @@ import ReactMarkdown from "react-markdown";
 import { useReportContext } from "./ReportContext";
 import { useStyles } from "./Styles";
 import { ChatMessageDisplay, isTextContent, isImageContent } from "./Summary";
+import type { MetricType } from "./MetricCard";
 
-export const ConversationDetails = ({ messages, model, usage }: {
+export const ConversationDetails = ({ messages, model, usage, selectedMetric }: {
     messages: ChatMessageDisplay[];
     model?: string;
     usage?: UsageDetails;
+    selectedMetric?: MetricType | null;
 }) => {
     const classes = useStyles();
     const [isExpanded, setIsExpanded] = useState(true);
@@ -59,7 +61,27 @@ export const ConversationDetails = ({ messages, model, usage }: {
         return result;
     };
 
+    // Builds one display group per non-empty context entry of the selected metric,
+    // labelling each group with the lower-cased context name.
+    const getContextGroups = (): { key: string, contents: AIContent[] }[] => {
+        const suppliedContext = selectedMetric?.context;
+        if (!suppliedContext) {
+            return [];
+        }
+
+        // Entries whose content list is missing or empty are left out.
+        const nonEmptyEntries =
+            Object.entries(suppliedContext)
+                .filter(([, entryContents]) => entryContents && entryContents.length > 0);
+
+        return nonEmptyEntries.map(([name, entryContents]) => ({
+            key: name.toLowerCase(),
+            contents: entryContents,
+        }));
+    };
+
     const messageGroups = groupMessages();
+    const contextGroups = getContextGroups();
 
     return (
         <div className={classes.section}>
@@ -79,7 +101,7 @@ export const ConversationDetails = ({ messages, model, usage }: {
                         );
 
                         return (
-                            <div key={index} className={messageRowClass}>
+                            <div key={`msg-${index}`} className={messageRowClass}>
                                 <div className={classes.messageParticipantName}>{group.participantName}</div>
                                 <div className={classes.messageBubble}>
                                     {group.contents.map((content, contentIndex) => (
@@ -91,6 +113,19 @@ export const ConversationDetails = ({ messages, model, usage }: {
                             </div>
                         );
                     })}
+                    
+                    {contextGroups.map((group, index) => (
+                        <div key={`context-${index}`} className={mergeClasses(classes.messageRow, classes.userMessageRow)}>
+                            <div className={classes.messageParticipantName}>{`supplied evaluation context (${group.key})`}</div>
+                            <div className={classes.contextBubble}>
+                                {group.contents.map((content, contentIndex) => (
+                                    <div key={contentIndex}>
+                                        {renderContent(content)}
+                                    </div>
+                                ))}
+                            </div>
+                        </div>
+                    ))}
                 </div>
             )}
         </div>
 
@@ -94,6 +94,9 @@ type BaseEvaluationMetric = {
     $type: string;
     name: string;
     interpretation?: EvaluationMetricInterpretation;
+    context?: {
+        [K: string]: AIContent[]
+    };
     diagnostics?: EvaluationDiagnostic[];
     metadata: { 
         [K: string]: string 
 
@@ -32,7 +32,7 @@ export const ScoreDetail = ({ scenario, scoreSummary }: { scenario: ScenarioRunR
             onMetricSelect={setSelectedMetric}
             selectedMetric={selectedMetric} />
         {selectedMetric && <MetricDetailsSection metric={selectedMetric} />}
-        <ConversationDetails messages={messages} model={model} usage={usage} />
+        <ConversationDetails messages={messages} model={model} usage={usage} selectedMetric={selectedMetric} />
         {scenario.chatDetails && scenario.chatDetails.turnDetails.length > 0 && <ChatDetailsSection chatDetails={scenario.chatDetails} />}
     </div>);
 };
@@ -127,6 +127,14 @@ export const useStyles = makeStyles({
         backgroundColor: tokens.colorNeutralBackground3,
         border: '1px solid ' + tokens.colorNeutralStroke2,
     },
+    contextBubble: {
+        padding: '0.75rem 1rem',
+        borderRadius: '12px',
+        overflow: 'hidden',
+        wordBreak: 'break-word',
+        backgroundColor: tokens.colorBrandBackground2,
+        border: '1px solid ' + tokens.colorNeutralStroke2,
+    },
     cacheHitIcon: {
         color: tokens.colorPaletteGreenForeground1,
     },
Original file line number	Diff line number	Diff line change
`@@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(`
`145`	`145`	`if (string.IsNullOrEmpty(evaluationResponseText))`
`146`	`146`	`{`
`147`	`147`	`rating = Rating.Inconclusive;`
`148`		`- result.AddDiagnosticToAllMetrics(`
	`148`	`+ result.AddDiagnosticsToAllMetrics(`
`149`	`149`	`EvaluationDiagnostic.Error(`
`150`	`150`	`"Evaluation failed because the model failed to produce a valid evaluation response."));`
`151`	`151`	`}`
`@@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync(`
`168`	`168`	`if (string.IsNullOrEmpty(repairedJson))`
`169`	`169`	`{`
`170`	`170`	`rating = Rating.Inconclusive;`
`171`		`- result.AddDiagnosticToAllMetrics(`
	`171`	`+ result.AddDiagnosticsToAllMetrics(`
`172`	`172`	`EvaluationDiagnostic.Error(`
`173`	`173`	`$"""`
`174`	`174`	`Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:`
`@@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync(`
`183`	`183`	`catch (JsonException ex)`
`184`	`184`	`{`
`185`	`185`	`rating = Rating.Inconclusive;`
`186`		`- result.AddDiagnosticToAllMetrics(`
	`186`	`+ result.AddDiagnosticsToAllMetrics(`
`187`	`187`	`EvaluationDiagnostic.Error(`
`188`	`188`	`$"""`
`189`	`189`	`Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:`
`@@ -281,7 +281,7 @@ void UpdateResult()`
`281`	`281`
`282`	`282`	`if (!string.IsNullOrWhiteSpace(rating.Error))`
`283`	`283`	`{`
`284`		`- result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));`
	`284`	`+ result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));`
`285`	`285`	`}`
`286`	`286`	`}`
`287`	`287`	`}`
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(`
`105`	`105`
`106`	`106`	`if (string.IsNullOrEmpty(evaluationResponseText))`
`107`	`107`	`{`
`108`		`- metric.AddDiagnostic(`
	`108`	`+ metric.AddDiagnostics(`
`109`	`109`	`EvaluationDiagnostic.Error(`
`110`	`110`	`"Evaluation failed because the model failed to produce a valid evaluation response."));`
`111`	`111`	`}`
`@@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync(`
`115`	`115`	`}`
`116`	`116`	`else`
`117`	`117`	`{`
`118`		`- metric.AddDiagnostic(`
	`118`	`+ metric.AddDiagnostics(`
`119`	`119`	`EvaluationDiagnostic.Error(`
`120`	`120`	`$"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));`
`121`	`121`	`}`