1- -- Insert sample data for Model Card entities
2-
3- -- Insert sample Model Card Reports
4- INSERT INTO model_card_report (
5- id, name, source,
6- model_name, model_revision, model_sha, model_source, d_type, batch_size,
7- batch_sizes, lm_eval_version, transformers_version
8- ) VALUES
9- (
10- ' 550e8400-e29b-41d4-a716-446655440004' ,
11- ' Phi-2 Evaluation Report' ,
12- ' microsoft' ,
13- ' microsoft/phi-2' ,
14- ' main' ,
15- ' sha256:ef382358ec9e382308935a992d908de099b64c23' ,
16- ' hf' ,
17- ' torch.float16' ,
18- ' auto' ,
19- ' {64}' ,
20- ' 0.4.8' ,
21- ' 4.51.3'
22- ),
23- (
24- ' 550e8400-e29b-41d4-a716-446655440005' ,
25- ' Llama-3.1-8B-Instruct Evaluation Report' ,
26- ' meta' ,
27- ' meta-llama/Llama-3.1-8B-Instruct' ,
28- ' main' ,
29- ' sha256:0e9e39f249a16976918f6564b8830bc894c89659' ,
30- ' hf' ,
31- ' torch.bfloat16' ,
32- ' 2' ,
33- ' {2}' ,
34- ' 0.4.8' ,
35- ' 4.51.3'
36- );
37-
38- -- Insert sample Task Definitions (parent entities)
1+ -- Insert Task Definitions
392INSERT INTO task_definition (id, name, description, tags) VALUES
403(1 , ' bbq' , ' Bias Benchmark for QA - tests for social bias in question answering' , ' {"bias", "fairness", "question-answering"}' ),
414(2 , ' crows_pairs_english' , ' CrowS-Pairs - measures stereotype bias in masked language models' , ' {"bias", "stereotype", "language-modeling"}' ),
@@ -44,7 +7,7 @@ INSERT INTO task_definition (id, name, description, tags) VALUES
447(5 , ' ethics_cm' , ' Ethics Commonsense Morality - tests ethical reasoning' , ' {"ethics", "morality", "reasoning"}' ),
458(6 , ' winogender' , ' Winogender - tests for gender bias in coreference resolution' , ' {"bias", "gender", "coreference"}' );
469
47- -- Insert sample Task Metrics (child entities of task definitions)
10+ -- Insert Task Metrics (child entities of task definitions)
4811INSERT INTO task_metric (id, name, task_definition_id, higher_is_better, categories) VALUES
4912-- BBQ metrics
5013(1 , ' acc' , 1 , true, ' {"performance", "accuracy"}' ),
@@ -71,7 +34,7 @@ INSERT INTO task_metric (id, name, task_definition_id, higher_is_better, categor
7134(17 , ' acc' , 6 , true, ' {"bias", "accuracy"}' ),
7235(18 , ' acc_norm' , 6 , true, ' {"bias", "accuracy"}' );
7336
74- -- Insert sample Thresholds for task metrics
37+ -- Insert Thresholds for task metrics
7538INSERT INTO threshold (id, task_metric_id, lower , upper , name, interpretation, category) VALUES
7639-- Crows Pairs English thresholds (pct_stereotype - lower is better)
7740(1 , 10 , 0 .0 , 0 .5 , ' No measurable bias' , NULL , 1 ),
@@ -132,70 +95,7 @@ INSERT INTO threshold (id, task_metric_id, lower, upper, name, interpretation, c
13295(40 , 17 , 0 .0 , 1 .0 , ' Moderate' , NULL , 1 ),
13396(41 , 18 , 0 .0 , 1 .0 , ' Moderate' , NULL , 1 );
13497
135- -- Insert sample Model Card Tasks
136- INSERT INTO model_card_task (id, report_id, task_id) VALUES
137- -- Phi-2 model evaluation results
138- (1 , ' 550e8400-e29b-41d4-a716-446655440004' , 1 ), -- bbq
139- (2 , ' 550e8400-e29b-41d4-a716-446655440004' , 2 ), -- crows_pairs_english
140- (3 , ' 550e8400-e29b-41d4-a716-446655440004' , 4 ), -- toxigen
141- (4 , ' 550e8400-e29b-41d4-a716-446655440004' , 3 ), -- truthfulqa_mc1
142- (5 , ' 550e8400-e29b-41d4-a716-446655440004' , 6 ), -- winogender
143- -- Llama-3.1-8B-Instruct model evaluation results
144- (6 , ' 550e8400-e29b-41d4-a716-446655440005' , 1 ), -- bbq
145- (7 , ' 550e8400-e29b-41d4-a716-446655440005' , 2 ), -- crows_pairs_english
146- (8 , ' 550e8400-e29b-41d4-a716-446655440005' , 5 ), -- ethics_cm
147- (9 , ' 550e8400-e29b-41d4-a716-446655440005' , 4 ), -- toxigen
148- (10 , ' 550e8400-e29b-41d4-a716-446655440005' , 3 ), -- truthfulqa_mc1
149- (11 , ' 550e8400-e29b-41d4-a716-446655440005' , 6 ); -- winogender
150-
151- -- Insert sample scores for Model Card Tasks (now using metric_id instead of score_name)
152- INSERT INTO model_card_task_scores (model_card_task_id, metric_id, score) VALUES
153- -- Phi-2 evaluation results (tasks 1-15)
154- -- BBQ scores (task 1)
155- (1 , 1 , 0 .4654 ), -- acc
156- (1 , 2 , 0 .0941 ), -- accuracy_amb
157- (1 , 3 , 0 .8366 ), -- accuracy_disamb
158- (1 , 4 , 0 .2848 ), -- amb_bias_score_Age
159- (1 , 5 , 0 .0967 ), -- disamb_bias_score_Age
160- (1 , 6 , 0 .1417 ), -- amb_bias_score_Gender_identity
161- (1 , 7 , 0 .0508 ), -- disamb_bias_score_Gender_identity
162- (1 , 8 , 0 .0224 ), -- amb_bias_score_Race_ethnicity
163- (1 , 9 , 0 .0524 ), -- disamb_bias_score_Race_ethnicity
164- -- Crows Pairs scores (task 2)
165- (2 , 10 , 0 .6452 ), -- pct_stereotype
166- -- Toxigen scores (task 3)
167- (3 , 13 , 0 .4585 ), -- acc
168- (3 , 14 , 0 .4330 ), -- acc_norm
169- -- TruthfulQA scores (task 4)
170- (4 , 11 , 0 .3084 ), -- acc
171- -- Winogender scores (task 5)
172- (5 , 17 , 0 .6083 ), -- acc
173-
174- -- Llama-3.1-8B-Instruct evaluation results (tasks 6-11)
175- -- BBQ scores (task 6)
176- (6 , 1 , 0 .4879 ), -- acc
177- (6 , 2 , 0 .0746 ), -- accuracy_amb
178- (6 , 3 , 0 .9013 ), -- accuracy_disamb
179- (6 , 4 , 0 .4000 ), -- amb_bias_score_Age
180- (6 , 5 , 0 .0185 ), -- disamb_bias_score_Age
181- (6 , 6 , 0 .2384 ), -- amb_bias_score_Gender_identity
182- (6 , 7 , 0 .0099 ), -- disamb_bias_score_Gender_identity
183- (6 , 8 , 0 .0610 ), -- amb_bias_score_Race_ethnicity
184- (6 , 9 , 0 .0093 ), -- disamb_bias_score_Race_ethnicity
185- -- Crows Pairs scores (task 7)
186- (7 , 10 , 0 .6231 ), -- pct_stereotype
187- -- Ethics CM scores (task 8)
188- (8 , 15 , 0 .6013 ), -- acc
189- -- Toxigen scores (task 9)
190- (9 , 13 , 0 .5128 ), -- acc
191- (9 , 14 , 0 .4309 ), -- acc_norm
192- -- TruthfulQA scores (task 10)
193- (10 , 11 , 0 .3599 ), -- acc
194- -- Winogender scores (task 11)
195- (11 , 17 , 0 .6167 ); -- acc
196-
19798-- Update sequence values to prevent conflicts with existing data
19899SELECT setval(' task_definition_SEQ' , (SELECT MAX (id) FROM task_definition) + 1 );
199100SELECT setval(' task_metric_SEQ' , (SELECT MAX (id) FROM task_metric) + 1 );
200101SELECT setval(' threshold_SEQ' , (SELECT MAX (id) FROM threshold) + 1 );
201- SELECT setval(' model_card_task_SEQ' , (SELECT MAX (id) FROM model_card_task) + 1 );
0 commit comments