-
Notifications
You must be signed in to change notification settings - Fork 1
/
allfiles.txt
executable file
·2854 lines (2854 loc) · 245 KB
/
allfiles.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
0a715ffefca99f49f4d3812b843793d8efa3066e
20dabce57f9fd9a8f050dba8fd8a50aee4b7f814
2547ef1de82d6cd31bd6d78f604a87f9441f3fe8
381937aa00722b3c47dafcc5dabdbd4dcb1e57df
40f8a44638839ddb4361994ba24e83b23673c456
42fb4e7b26cfb3ab2154fd617af2e7ed0d82d938
878d52608bd60cacfa7d4677f4776c7bef37693d
9e2667cbdc449557a6d14e6fde4a606d336ff9ce
afd17e948d8ca6562f0ff8599051cb08787ea4ef
be93d863bcf31c9fb38e7975796048a8aa2fd031
20997681a78b58fd9bca1c5c3d1b65c3166a5010
6b932535d72a7e9bcea95ee0a8d6d6a933d35497
76bda4647b8901b950a782a4051fafe30124bb1f
95a006e0e23f43b50b02e4b7f1adf532b7e3cdbe
9cb79b52e353ddbd6638d5860a92c49de4e0c532
c71833874b838f2951a1386f7109e0bca460bca1
ede909e583e5becde0003f7962fca2d8b06acbe0
f0fddb227909c0d6fecda713a0e2ede225981dce
f433417505a4296254b329b997f13c3ba40161fb
f43f22396eb4ef5b2ec93cca6eb184b476cc79e6
5a640139e8e082974adb937ac93d72b5368d07b9 .env
9be1ea021a1db6dfd49e0ac44e0358fa523bc59f .gitattributes
bc92b29896c35b0cf4d3c4aabae7d83927e63ccd .gitattributes
c90eb68ef7cb62ce158b159070e9f846a74df063 .gitattributes
b4f6f07da98d4152b349a17fa3e778752411d66c .github
3625873eef4faf57e1b672579b5a6dfc3e4af387 .github/CODEOWNERS
73bf3d051b0f313b66482e1c7af21210a6e4e1df .github/ISSUE_TEMPLATE
404958c39a298d962e021c49a3092c51d9913f91 .github/ISSUE_TEMPLATE/bug_report.yml
f1f2653a5c3e1bfeae22cfebed485e8ae3c37f69 .github/ISSUE_TEMPLATE/feature_request.yml
7e5ad8b86cb841b80dd2d16ae707f0b7ef8043e5 .github/PULL_REQUEST_TEMPLATE.md
2afa082cf06241c8c48bb62d7e2ad335a96eef2a .github/config.yml
8e638ac2ad14439b406316f8f10c5072e6df0ff3 .github/workflows
5aa14ea8054ee7f5ffc0958e6a3e3e7d30495037 .github/workflows/parse_yaml.py
11a6151d7c22228a8b1cb945989dbc3dd816aa04 .github/workflows/run_tests.yaml
f7a904a89ce90d41dc98ed58443343dc649625ec .github/workflows/test_eval.yaml
619e4691a1bd5fc2a3368d1adab37dde20900b8c .gitignore
f651895b608f6cf946c1f50d8cdb6c1eec4b3c08 .pre-commit-config.yaml
d255bae252a52a20d0d2ceee42cacabf475784fa AlignScore
68bc17f9ff2104a9d7b6777058bb4c343ca72609 AlignScore/.gitignore
683e9185ff9a7b9be51a9a7492e87b9a8c3f6621 AlignScore/LICENSE
affaeb79b63fa46cba095559e6719c4f71d1bea3 AlignScore/README.md
853432cf87b7d9eef32c5267d8172bbbfdb2a91b AlignScore/alignscore_fig.png
8220b17beb50503a16b972c444ad5498da819f8b AlignScore/baselines.py
9a1236fc50891b31d11ae7f3fca8728401c827fc AlignScore/benchmark.py
76ef2d562ab3e42103449851932a9aaca8322e01 AlignScore/ckpt
2d3ce6582fc9ff4f150ac1dd0f3a4ae441de8b20 AlignScore/ckpt/AlignScore-large.ckpt
60683d502a11a8039875fde5a7e4f02006608b93 AlignScore/evaluate.py
7a1905cfe172fb557094958ae1d3b4b483799a73 AlignScore/generate_training_data.py
0781eeb557383f1a764ea510b4e280810901686b AlignScore/pyproject.toml
aee3c1502d7cc0e01d6a3b1f000d6e631dc5b986 AlignScore/requirements.txt
20605de3e637684db3e1462efbef0f0b535de3d6 AlignScore/src
0b970de293e4914f04d3db72db870f89215a8040 AlignScore/src/alignscore
f810e40272d14389ef625f28904e384fbc6c634f AlignScore/src/alignscore/__init__.py
3eefa7bfb73c61c56351ea183d45053198c2bb57 AlignScore/src/alignscore/alignscore.py
8989f6a5b63ce3f9b9d006027821ebde8fad28d8 AlignScore/src/alignscore/dataloader.py
6dbfdf293292fe9d7790ec1286241e6bb1128494 AlignScore/src/alignscore/inference.py
e8dca6bab791333e3c291aedf61baba01aed24be AlignScore/src/alignscore/model.py
2c14fc58992abb68fbeca37104c483f011afb06a AlignScore/train.py
405cc0eb38b5f0fafdf33bacacc3b16c14e3d2b4 LICENSE.md
a264d0a61ec9e2a4fb4bdf2821872e789dee8726 MANIFEST.in
bf7dfb5381d02b4893fd7cf87f9263ecdd06fcc0 Makefile
314b5bc3b755b6572619ff266b0e448d94d365fa README.md
8a3c8ccf4f03965d4ae126858fa874252c6a47f9 SECURITY.md
9bc8b584b8189c11cf65838089b75f8f8808375a __MACOSX
ecdaf1dcba95621c7d55bf57413d5c97e375da9d __MACOSX/._dev.json
520882a8195188ded2f65ac6aa0be338865cf941 docs
7c25dd6bc0d854b3ef2665712129dde6e24da7e3 docs/build-eval.md
2f9f6c039950fa52c7456c37b22f3eebb60a1c1f docs/completion-fn-protocol.md
7b4200ff2b3061ce2485e119edfc69aca255b230 docs/completion-fns.md
9e61fb647cb5a8dda0379163b5e361c0f38d2f42 docs/custom-eval.md
c6c90696d0ce08c950b0721aae91828fe7a4c1da docs/eval-templates.md
495acadb1844f04fe16cb5cbe6b54aeb86ee125d docs/run-evals.md
0e1fce1b6f96609b97439a785fbbb49318ab1f67 eval_bash
14944fd1b6b52d44ee2dfc8c984f66b2e977ff50 eval_bash
25ea431b0d9e32e899df26b4b71dd7b459caf0a0 eval_bash
877bf6702b1900a139995f3664a6ef8de7eb2b3b eval_bash
8c9494c65ac07b1fb18b9a3c81b6dbe387fba7d0 eval_bash
d72d0670158fe7faf61007670dba605e86c605ba eval_bash
fc1e28468209c4205e9b2f601f0f648903e404b9 eval_bash
f7fc8017729696bb10fc7ea07301513ccc954d8f eval_bash/agentclinic
a23c9bb17b362f04c189fdf7ea86e1a912e9ab4b eval_bash/agentclinic/full.sh
4af0c46adba66e5b1ec5c7caaafa6593e1dbe02f eval_bash/agentclinic/test.sh
4879aea00ba8a6668c0cf4f57eacf717f11612f7 eval_bash/chatdoctor
bbc6de4d64ada5afa6d542754f14bf2f8e510820 eval_bash/chatdoctor
c56fe6a621d3c5b120b8c5e5607eb806d6d2a63b eval_bash/chatdoctor
108a49495134e35f78c8a30c6709f9643baa418b eval_bash/chatdoctor/align3.5.sh
efc82f5dbe8ecb1a4c952fe62544eaede7558ac5 eval_bash/chatdoctor/sample.sh
42c26e39255f071818308daf6a84f633cd667f38 eval_bash/chatdoctor/sample3.5.sh
0870bcec4dea910fca01fc2f66a11602671eb2ff eval_bash/chatdoctor/sample4.sh
a13dcd85de6e2887762a1c866f46abb9493ceba6 eval_bash/chatdoctor/sample4.sh
454f49427e6d73d8edb9b4e2d6cc17a6976fe93c eval_bash/chatdoctor/sampleo1.sh
aa659dd83d477dd3a6789f95cd1c35db46b59d23 eval_bash/ddxplus
cdd1803759a71371cacd8e9aa8b7e01b85b48d0d eval_bash/ddxplus
6f848227640cf2a696ad3a5436fd2cbf17b09aca eval_bash/ddxplus/sample.sh
8246375480eacda5ca2c29b2c6eddf57c124b9ef eval_bash/ddxplus/sample_.sh
c1e5f458f88e7d9eef479bb3bf8f14e8485682aa eval_bash/ddxplus/sample_.sh
c4f5aed1fc410e609b9f6f8a146677b616bbcda3 eval_bash/ddxplus/sample_3.5.sh
ee94bd4bfbd1b7b7bef3563dd848e6840e028360 eval_bash/ddxplus/sample_4.sh
779fcb09ef7b5a51e6c4e7a35cdd6a60d2b5d714 eval_bash/ddxplus/sample_new.sh
1aa7f1f0cfe2aed28716434024fe406cc1c641f0 eval_bash/lancet
92bef3411d686d2ca3c1ee4635a5e48eb6359da0 eval_bash/lancet
da85262e3f1cd097eacaba91c41b0e7b931007d4 eval_bash/lancet/full.sh
3e352aa320f497f900a3ca9e68b5b39f15899792 eval_bash/lancet/full_3.sh
a43538c79ee2f0435c33e3f45f6313bfcf19eb37 eval_bash/lancet/full_4.sh
cdf2ae7becd04f37d534b860828a80677ec62c79 eval_bash/lancet/sample.sh
a1ece72cbd58b22296864ed802220c26356633c0 eval_bash/lancet/sample_onlya.sh
79ae0433f1b9464574fce595990408127504bb15 eval_bash/lancet/sample_onlya_3.sh
528118741d4a13a2807ecc30319e75bfc1bd5a83 eval_bash/lancet/sample_onlya_4.sh
c890754141f813bae6525bbea09e9fb17c653eff eval_bash/medmcqa
e439854748086538c4de943b3db3fb779a33c479 eval_bash/medmcqa/full.sh
22c846c1a8d63dffe08e88a0eb53e6e32e8346b8 eval_bash/medqa
eb300a95c97cfc31cafd7d5444c6e4b3d5961622 eval_bash/medqa
5f7195f9f35524f965cfcc621d0b847823c0f780 eval_bash/medqa/full.sh
efde86e046a3bf1b6c5b192c1abf8cb57d4f43b8 eval_bash/medqa/sample.sh
4735fde25ec0c6309cd2d87cebadf13f48db2f44 eval_bash/medqsum
2aa2818799557ed133e5116b3add1be0fd55dd9c eval_bash/medqsum/sample.sh
4cbd20c042d841715672e620b49465cb50d8387c eval_bash/medqsum/sample_3.sh
a9007f4dd9f0c4483a2bbacf9a4b92f924478e01 eval_bash/medqsum/sample_4.sh
a86ec2ea7174f7c4cfe74e6b5d0b98d016534d5b eval_bash/medqsum/test_3.sh
2cde71c1c7d7280d49ed098cfefbd2aebbe03aba eval_bash/mimic-cxr
73f45a0b4e2d67ca304c77f04be61d2ca07b4558 eval_bash/mimic-cxr
5aaa5b2b79d644ee87f97ea75295df9f11b3c758 eval_bash/mimic-cxr/sample.sh
d69186c21e7333645b82911ead8d603b6ce6fec9 eval_bash/mimic-cxr/sample_3.sh
3775e74dbbba43d570dbd3949c2b7a8470910261 eval_bash/mimic-cxr/sample_4.sh
771e7215d0df4eb5bfe72040b8c3bf754c5f1144 eval_bash/mimic-cxr/sample_4.sh
0c530ce3a66eb1a2d52e90eabec673fe7e89cee0 eval_bash/mimic-iv-ul
c9a6ec614ac63a659da764b6b3018ea26f5fa169 eval_bash/mimic-iv-ul
b60fa65b6255e6ad3d9109a1ff61bcedc6fe2fd7 eval_bash/mimic-iv-ul/sample.sh
21914c34b7ad9adeade7cc400b91f8949e56a03b eval_bash/mimic-iv-ul/sample_3.sh
9fcd2d0e396865980ee2a8821d7dcc1a53a2d814 eval_bash/mimic-iv-ul/sample_4.sh
ce57779456c798cd16b34ad98a4c94a7f3a92e2e eval_bash/nejm
bd332614268fdf4e9c0b8dd7d87c26d453a1070d eval_bash/nejm/sample.sh
e156796a0debf9e50588fd950e00b998157f3a5e eval_bash/nejm/sample_3.sh
a9e9f5c6272475b79230f79f8468adc74a725b00 eval_bash/nejm/sample_4.sh
4ec9261c4ccd611cbf30dd5e7bf5fd33b40c7870 eval_bash/pubmedqa
b931206c11fe82c9d81186590ec4ab8ca12ea721 eval_bash/pubmedqa/full.sh
e3d4f37c6ac142bf650811112e22d1df386c600e eval_bash/pubmedqa/full_3.5.sh
2231131c16c4acc15dd7f0bc200162df26ffd492 eval_bash/rct-text
5e3648bdbfe9b648d21214644d6d5cba153339da eval_bash/rct-text
44184eca29dedd90e3843bb2243ee37632591e67 eval_bash/rct-text/sample.sh
c78e6543c0abc36cd407ec8bb9672877a4a8c411 eval_bash/rct-text/sample.sh
4cbb4739ec0c796d7ec2cacbdbb86735032ed49b eval_bash/rct-text/sample_3.sh
6e0c0a1c3be2ea510e337ae62cb71760fa60f0da evallogs
7d4a81cbf70412869981dbc9501d7840e69e232d evallogs
96451049dc451806051af5f6a9368f76c72bfff9 evallogs
a99979e6054df7554d84868fd50fb4cec8348ed9 evallogs
c321adc20edda61402500661a7f2a307cb2fb4b9 evallogs
ebf2acc7f3fb567deef66d0c1d3df8b93be1dd3b evallogs
97ce6a47704a099699cd2baa0c3f87311f7e30e1 evallogs/240914010804U2ASD66M_o1-preview_pubmedqa_full.jsonl
0e7feba6bf85d84a829629f97e355f69fffbfc3e evallogs/240914010830HNBT4M42_o1-preview_pubmedqa_full.jsonl
52edacd79504bfd44b9fb408148b5fffb74db96f evallogs/240914011333Z6GUOAMR_o1-preview_pubmedqa_full.jsonl
2c91086743296ea4d5703203022ab961650723b3 evallogs/2409140122285CV7YLBW_gpt-3.5-turbo_medqa.jsonl
bb05a9bf49d66fa6c20dc7facbc8c468ac9208d5 evallogs/240914012435PI633RCY_gpt-3.5-turbo_medqa.jsonl
519bdd8f93b3b30cdacb7a8a2d7c4f3794ce413a evallogs/240914013147GS65CN6P_gpt-3.5-turbo_medqa.jsonl
5399c204dcef0b66df6c4982517d6f38e57e74c3 evallogs/240914033817YJZRI5CX_gpt-3.5-turbo_medqa.jsonl
5161e763cf0800bbb2b65b197d909da78812c36f evallogs/240914033834DVSTUWEP_gpt-3.5-turbo_medqa.jsonl
92f400421605f0f6417fe94bd0aa9be78ae6c730 evallogs/2409140341104EFIH3AK_gpt-3.5-turbo_medqa.jsonl
b0b3bc30537f877715b52a0641b7e657973de797 evallogs/240914034440ZYVCLK6I_o1-preview_pubmedqa_full.jsonl
ff27da6337dd36258353054e0bf358553153083b evallogs/240914041016HYM7NDUO_o1-preview_medqa.jsonl
1e2ee404692de2eedd9721b2348784c83d67b568 evallogs/240914041800SLCY7R2E_o1-preview_medqa.jsonl
fd0d7f816f3c6969f80ed7a3e18860793a1caba0 evallogs/240914042645LKVW7PCB_o1-preview_medqa_full.jsonl
2c98ff608083ffd2f83e8c53744ccaae22d84f93 evallogs/2409140558245KNEATKC_o1-preview_medmcqa_full.jsonl
4c0254e07f9d2bb177e0215ab4fd189674d43d05 evallogs/240914061113Y64ET6BA_o1-preview_medmcqa_full.jsonl
3b1d422e8433c0687b5b070140522de9b48ef26b evallogs/2409141905322QVXHS5Z_o1-preview_ddxplus.jsonl
5b8613f4c3d00953683badb63f187b74af0c0520 evallogs/240914191555OBYLEJZG_o1-preview_ddxplus.jsonl
54adf9abe3764b34b806bbc65f3b5f9fe4e437fe evallogs/240914204243KHAWSJSX_gpt-3.5-turbo_ddxplus.jsonl
e09f857e8147fdd7ec7ae75045ee88434b61fe4f evallogs/240914205534NKNVVNRX_gpt-4-0125-preview_ddxplus.jsonl
60d4312d56bfe7777df426a183f06cd761d81d1b evallogs/240914210854VNZFU5VZ_o1-preview_ddxplus_new.jsonl
a9cba1385f0614de18214d2626f95b745ff38b70 evallogs/240914211231BVEUK2ZX_gpt-3.5-turbo_ddxplus.jsonl
8472b2c7d0191cb45b1b88b58dd36820924f0daf evallogs/240914223849RNKSQEXC_o1-preview_ddxplus_.jsonl
272ee42503adb7cd840c09470956d9f0bc4e5752 evallogs/240914223904VFA3ZKAT_o1-preview_ddxplus_.jsonl
d5fce9e59094fa4a4221c4016d53e873d1c4def4 evallogs/240914224049YNYLGPMP_gpt-3.5-turbo_ddxplus_.jsonl
026a0a03385975e577a56da61390d900267831c4 evallogs/2409142247286QKIHBUV_gpt-3.5-turbo_ddxplus_.jsonl
3cba17b400848f65efad0591368879a311467b26 evallogs/240914230517ZB2GA5BS_gpt-3.5-turbo_ddxplus_.jsonl
dd952b062342e71394e4f6802c7063ea99148ca4 evallogs/240914230536XCBWYBIF_gpt-3.5-turbo_ddxplus_.jsonl
0d0ea633338054d880171488827db027f450923b evallogs/240914230611ZHC6DXPY_o1-preview_lancet.jsonl
72234911f9f15165552b6e35df5f0d3fa8e821c9 evallogs/240914230713DHY3UJ3G_gpt-3.5-turbo_ddxplus_.jsonl
ea49db2b679650a31510de3002d708b570a04c02 evallogs/240914230923K4R4W7O2_gpt-3.5-turbo_ddxplus_.jsonl
6a43b2bc87fe0b00fdb3929b7fa4d88ce8c62a04 evallogs/240914231011Y7UP7ZPG_gpt-3.5-turbo_ddxplus_.jsonl
08584d85e8c6d44a2317b767c8edce926685c991 evallogs/240914231324SHUMWSFN_gpt-3.5-turbo_ddxplus_.jsonl
56c47da44afaf9c22843de9b3badb1f9185e9c80 evallogs/240914231557EVCBOKA4_gpt-3.5-turbo_ddxplus_.jsonl
a10461858e355446afe7ee8fadff3b53556c3479 evallogs/240914231728HPKSCPIF_gpt-3.5-turbo_ddxplus_.jsonl
24c70004a691657957a096624be732dfaaf5ccf2 evallogs/240914232109I3M7S3GC_o1-preview_lancet_onlya.jsonl
dd4b3f159ebc57561686b23cfb9e0726ed919549 evallogs/2409142322364V27XKMB_o1-preview_lancet_onlya.jsonl
b5de8e775d99e516b73ab023622174be1c33380b evallogs/240914232338EMPTQG4E_gpt-3.5-turbo_ddxplus_.jsonl
6351b17a1492649049b5ddb23b80e8ceffe83435 evallogs/240914232730CGRYNJY3_gpt-4-0125-preview_ddxplus_.jsonl
5d603dfab6c56caef4c46779ed12aa901e24fed4 evallogs/240914232845P6VIHHAA_gpt-4-0125-preview_lancet_onlya.jsonl
bf6b8a0e020ca518623886216fd881d482158f42 evallogs/240914232905N52MN6FL_gpt-3.5-turbo_lancet_onlya.jsonl
d7a177a4dfde92c8720c43121722cd843a144cc1 evallogs/240914233008TSYCUHGF_gpt-4-0125-preview_lancet_onlya.jsonl
6dc84a4a6bb5d8c4afb354152e8fda076dbbaccf evallogs/240914233201YCC3VFKK_gpt-4-0125-preview_lancet_onlya.jsonl
19f55138ad5a6380ea45800762387d4d4e51650e evallogs/240914233216OSWWLASC_gpt-4-0125-preview_ddxplus_.jsonl
946c41b8067d0088d7c3cb314b138722c54d5442 evallogs/240914233236LQNG2RGE_gpt-3.5-turbo_lancet_onlya.jsonl
690c060ff8a117e6fff2f70c912a136497d6bdfa evallogs/2409142344396DPNEKO7_gpt-4-0125-preview_ddxplus_.jsonl
5f9c2426648d49a0875a96b0b068a99ef63a86a7 evallogs/240914234612EPVFXXDB_gpt-4-0125-preview_ddxplus_.jsonl
d94d889a38021974d85f70202bedd0b8e38239a2 evallogs/240914234911FSKFGGGN_gpt-4-0125-preview_ddxplus_.jsonl
ec7f0af7f16ba7dd10f71adf95f3d0c0f6d68720 evallogs/240914234947AJ6XD2TU_gpt-4-0125-preview_ddxplus_.jsonl
a80dad4a27832cf47a067c1532ab2ab07a039fca evallogs/2409142350083GEYDHQF_gpt-4-0125-preview_ddxplus_.jsonl
5fc285b56ab0fc31a58aa566ce9a4dfa6633511e evallogs/240914235040UI4RJCSL_gpt-4-0125-preview_ddxplus_.jsonl
9c159f4a32bc5871a8a7b4498cc24c71f7022cd2 evallogs/240914235842NL3GQJK2_gpt-4-0125-preview_ddxplus_.jsonl
9f1fca8da2e86e4327bbdbea6794d2fc7d247ff9 evallogs/240915000139QC7SCAPV_gpt-4-0125-preview_ddxplus_.jsonl
52808c918ac4c320fe3cd50d4bb03e02af3d9231 evallogs/2409150001543MZTK32B_gpt-4-0125-preview_ddxplus_.jsonl
e111fba19a5d6fbfea68e61a1b46c3324cc3448c evallogs/240915020856FBMU6TWL_gpt-3.5-turbo_mimic-iv-iu.jsonl
00d8cb8ed23acf57cf51feb0241246a07de4219a evallogs/240915020953WLSMF2DY_gpt-3.5-turbo_mimic-iv-ul.jsonl
9c3d3177c5c4011667dfd517d669fca9741eb15f evallogs/240915023409BEEW6WDQ_gpt-3.5-turbo_mimic-iv-ul.jsonl
d2d32c2f609c32d381c2fe529a062f0f6fc3f611 evallogs/240915023524O4ILD7NH_gpt-3.5-turbo_mimic-iv-ul.jsonl
25d83632b6b35296addcf273cc7c1c50dd5d3787 evallogs/240915023628NX27H7HZ_gpt-3.5-turbo_mimic-iv-ul.jsonl
587d329c1c727c46f50ca89e67c7095c79099ad7 evallogs/240915023657HCZGWSGC_gpt-3.5-turbo_mimic-iv-ul.jsonl
5516e8b98daa782f35f22b28354a59d56bc28d5a evallogs/2409150253316K6KCIOE_gpt-3.5-turbo_mimic-iv-ul.jsonl
2b3d14ba2175feaf00185a08c83cc0bdf2b1a2fd evallogs/240915025435SBLBV5QT_o1-preview_mimic-iv-ul.jsonl
f4772ea787378f3e4ff1154c5c7442cf9c9da209 evallogs/240915025435SBLBV5QT_o1-preview_mimic-iv-ul.jsonl
b851f1dc0d8388d72855536b879879bf0051f2ad evallogs/240915025526QBBK46SE_gpt-4-0125-preview_mimic-iv-ul.jsonl
4c3988da247fed320b8c924424890e280776c68f evallogs/240915025928DBKGQDLZ_gpt-4-0125-preview_ddxplus_.jsonl
7db5fc92fa3cfd96db8dc075ae267fb3554c7d0c evallogs/24091503014037XXKQU7_gpt-4-0125-preview_ddxplus_.jsonl
9d418af3167cf79a614d3c4aee48803d5430d4b8 evallogs/240915030701XPGFNE35_gpt-4-0125-preview_ddxplus_.jsonl
437114d8cd94a66f310fb346de19173a694c3c6c evallogs/240915032551NM4APXHN_o1-preview_chatDoctor_2.jsonl
3c656a97a559559cf7e7e7304115751086110456 evallogs/2409150327502FZESHLQ_o1-preview_mimic-cxr.jsonl
9f88cf55b7f5a8a3c7399083e1a7282ca6e28112 evallogs/240915032814HBEUOSLY_o1-preview_chatDoctor_2.jsonl
96f67d19c9a6988513f2ffda6b421c3c0cb8d73d evallogs/240915032929NCZ3CGNC_o1-preview_chatDoctor_2.jsonl
28b32a03912d2d8cb1a094cf822e47c9e83803e5 evallogs/240915033642HBKQR6LA_gpt-4-0125-preview_mimic-cxr.jsonl
0eb04e2d0ac75674a4909ff141f5688c1596169a evallogs/240915033833HLJEXRIV_o1-preview_chatDoctor_2.jsonl
1e46a325169289022bfb8a1df20f931ddd6cef79 evallogs/240915034343ODWPJO66_gpt-3.5-turbo_mimic-iv-ul.jsonl
fef8912d753d112bb1d620ca720297c8846636cd evallogs/2409150343542VQNU33I_o1-preview_chatDoctor_2.jsonl
be2a17308721bcff4d5521a266b3b14e48f2c424 evallogs/240915035617W4WSM7FU_o1-preview_chatDoctor_2.jsonl
544930e288c8f0c2cc2279bfa19ae7f15144eaea evallogs/chatDoctor_2_2024-09-15_04-06-10.log
e63a7c76a6111ed5cd1cfbcb7dae3819362b373c evallogs/chatDoctor_2_2024-09-15_04-13-09.jsonl
590472a8c90329012d5a26a6eeb08c29a30af6e5 evallogs/chatDoctor_2_2024-09-15_04-15-26.jsonl
a554c08410641a8f25acac60cf515ff857a7116d evallogs/chatDoctor_2_2024-09-15_04-34-48.jsonl
9faf5dfed99486d3954830cee52debf948db3f8f evallogs/chatDoctor_2_2024-09-15_15-05-02_o1-preview.jsonl
793c1fdb85d07b66690724cca05194898841fb7a evallogs/chatDoctor_2_2024-09-15_16-39-19_gpt-4-0125-preview.jsonl
4e0fd73b36a4235df17a5bed9903ad854db31a2f evallogs/chatDoctor_2_align_2024-09-15_05-36-34.jsonl
c66ca41ad3d4d88db5a99b106afe5e4087a26bb1 evallogs/chatDoctor_2_align_2024-09-15_05-40-39.jsonl
de9d7fb6e5c5c5a81c8ef13254a8b23f8159a14d evallogs/chatDoctor_2_align_2024-09-15_05-43-25.jsonl
3f7fc154cfd2e874f1e9279eabc76a2369cabb4c evallogs/chatDoctor_2_align_2024-09-15_05-44-55.jsonl
bd8fdbdcb9c57675b6fd53c7bdb5dffda064eec5 evallogs/chatDoctor_2_align_2024-09-15_05-48-44.jsonl
3605f138d46e2a039925dd0e6b9f886f23dce559 evallogs/chatDoctor_2_align_2024-09-15_05-53-32.jsonl
d4aa711aab84c5ac2963187aaca6ca8c39e11b3d evallogs/lancet_full_2024-09-15_15-28-03_o1-preview.jsonl
f57785d7b9d61811c17fec2373e58ee5f024ea6c evallogs/lancet_full_2024-09-15_15-43-00_o1-preview.jsonl
049fbd68212f5bc75040702e693eb3cc6f56454c evallogs/lancet_full_2024-09-15_15-54-33_gpt-4-0125-preview.jsonl
8f7bd0906287755021d152bc109a53c4d42119cb evallogs/lancet_full_2024-09-15_15-56-44_gpt-3.5-turbo.jsonl
02ba658b088582edc0314fb03c99b4bd0809db88 evallogs/lancet_onlya_2024-09-15_04-26-41.jsonl
dded40a949b1a4dd9c1e0673b35c1b694cae284e evallogs/lancet_onlya_2024-09-15_04-33-18.jsonl
ede9dbcf4d42a5accc0e6890ba6fb3e399c70365 evallogs/lancet_onlya_2024-09-15_15-22-57_gpt-4-0125-preview.jsonl
8ce9143842898ffef3b94e49c96843f8f48c4edd evallogs/lancet_onlya_2024-09-15_15-23-41_gpt-4-0125-preview.jsonl
2111ee0d4a3079e832fe9b09f6bd396d6c87f9f0 evallogs/medqsum_2024-09-15_16-11-19_o1-preview.jsonl
faf9e3b6e2694359ff04d4d59eb155e3c940dc7d evallogs/medqsum_2024-09-15_16-17-44_gpt-4-0125-preview.jsonl
9c2449794ad2521ddfbced126cac96ba254b7fa2 evallogs/medqsum_2024-09-15_16-19-02_gpt-3.5-turbo.jsonl
66528344a5b1641941696d06f799625e31fdf13d evallogs/medqsum_2024-09-15_16-35-47_o1-preview.jsonl
325d9ce610d8c551185a3d0ea40b633836ccf85a evallogs/medqsum_2024-09-15_16-41-43_gpt-4-0125-preview.jsonl
246b32e603dd0606a357427d50de82089ffed968 evallogs/medqsum_2024-09-15_17-24-58_gpt-3.5-turbo.jsonl
b73e9937c23eca353f535173d7c850dddc2b1b7b evallogs/medqsum_2024-09-15_17-29-44_gpt-3.5-turbo.jsonl
6b17500975217d9f2d8678d9fd298aee16df2150 evallogs/medqsum_2024-09-15_17-30-22_gpt-3.5-turbo.jsonl
d8ae2fd79d0d61087b7f39c1d31c9b4427cf60c7 evallogs/medqsum_2024-09-15_17-35-43_gpt-3.5-turbo.jsonl
241d46bd6b9cb9673625c182ae1a253ab6137a76 evallogs/medqsum_2024-09-15_17-36-51_gpt-3.5-turbo.jsonl
446da967944860fb34e6a4f484846ef067fee4cc evallogs/medqsum_2024-09-15_17-38-28_gpt-3.5-turbo.jsonl
03d4ce93160c2643a6fb9cf5f2ae5e35b03e046e evallogs/medqsum_2024-09-15_17-39-48_gpt-3.5-turbo.jsonl
18fdc24edeedaf1d0d43d65c477463a66def1eba evallogs/medqsum_2024-09-15_17-42-19_gpt-3.5-turbo.jsonl
d4442b6495f57efb8486285f203b6d64fd5a5893 evallogs/medqsum_2024-09-15_18-20-28_gpt-3.5-turbo.jsonl
c65d906d7b0e0a9ad6ee67b636404acaf8ee67a8 evallogs/medqsum_2024-09-15_18-25-54_gpt-3.5-turbo.jsonl
412c9ca2c989ae55f92a4fdc54b60ab29e432768 evallogs/medqsum_2024-09-15_18-26-01_gpt-3.5-turbo.jsonl
ed835818ff4bd7fcd4409acc42182996465b1a84 evallogs/medqsum_2024-09-15_18-35-28_gpt-3.5-turbo.jsonl
080ea310fd97bdc9c68f2d352fa17a7e05fbd5e3 evallogs/medqsum_2024-09-15_18-44-36_gpt-3.5-turbo.jsonl
4a9b652b5baca59a58dacdb68ea2d5bf14f04018 evallogs/medqsum_2024-09-15_18-45-46_gpt-3.5-turbo.jsonl
d8a184fc5407abba5e33fda24626b1d4d539f6d0 evallogs/medqsum_2024-09-15_18-46-22_gpt-3.5-turbo.jsonl
2a878a016a2e61f7a15e417f3cd772c022e585a3 evallogs/medqsum_2024-09-15_18-48-34_gpt-3.5-turbo.jsonl
de1d4e2c4a8448db31148c753aa2f434fcd2bff6 evallogs/medqsum_2024-09-15_18-49-17_gpt-3.5-turbo.jsonl
91a58e41f0cac30b9b01b6e60dc4b25235e92c36 evallogs/medqsum_2024-09-15_18-50-56_gpt-3.5-turbo.jsonl
2a18e1e67832aef4a4f45c5ec83c9c93f8ad4152 evallogs/medqsum_2024-09-15_18-53-10_gpt-3.5-turbo.jsonl
675289a91f229a009649480ea68de1f0de9e0705 evallogs/medqsum_2024-09-15_19-31-24_gpt-3.5-turbo.jsonl
773a45fa35460ca989172a95d10e977591bfe62d evallogs/medqsum_2024-09-15_20-29-57_gpt-3.5-turbo.jsonl
f2e3e6ad754a361a7f1cb16b800cdc27b583d706 evallogs/medqsum_2024-09-15_20-32-54_gpt-3.5-turbo.jsonl
103dc3b9e1f4497355b752cd2cef21969a12ec17 evallogs/medqsum_2024-09-15_20-33-50_gpt-3.5-turbo.jsonl
a6aa460b2b2524500592bd3fe408d823ddb88fec evallogs/medqsum_2024-09-15_20-47-11_gpt-3.5-turbo.jsonl
da64dbdfbd5ef60dc1264ada38db279887767019 evallogs/medqsum_2024-09-15_20-47-18_gpt-3.5-turbo.jsonl
0984483b379158bfb77167a0a748efe638d5dc86 evallogs/medqsum_2024-09-15_20-47-34_gpt-3.5-turbo.jsonl
c8553c7d7a2d0092a4099337741ad9dd7e2ec093 evallogs/medqsum_2024-09-15_20-56-43_gpt-3.5-turbo.jsonl
4fb1d2a74a3d937447cbbce976e41d7daf4b9fce evallogs/medqsum_2024-09-15_20-57-12_gpt-3.5-turbo.jsonl
bf5825c9039cf4880936ccdeac08bb734ab4d519 evallogs/medqsum_2024-09-15_20-58-21_gpt-3.5-turbo.jsonl
7ef9034c5a8cb6b8976386924e1baf124e1383eb evallogs/medqsum_test_2024-09-15_21-13-07_gpt-3.5-turbo.jsonl
f5c63e0894cbe268a210e0dff979f630ff44e71e evallogs/medqsum_test_2024-09-15_21-15-27_gpt-3.5-turbo.jsonl
4365aeef8a75507857162992cf1ebd3ad0ace411 evallogs/medqsum_test_2024-09-15_21-18-15_gpt-3.5-turbo.jsonl
833f422ad01b50e4f8f23a099f76848acdb1a2d7 evallogs/medqsum_test_2024-09-15_21-19-39_gpt-3.5-turbo.jsonl
a62719405be37d375613cd3fce5fb4cc4d56c0de evallogs/medqsum_test_2024-09-15_21-23-28_gpt-3.5-turbo.jsonl
9535bd7b94d5870027b2e50c86cf350a81ab2592 evallogs/medqsum_test_2024-09-15_21-25-35_gpt-3.5-turbo.jsonl
8378da4c50dffe42421461bbe21b448c937183c6 evallogs/medqsum_test_2024-09-15_21-26-15_gpt-3.5-turbo.jsonl
b63088b4579e6cb20202349b1b2d04fd90b1b1da evallogs/medqsum_test_2024-09-15_21-26-29_gpt-3.5-turbo.jsonl
0389f03ba0a616771a7814de5ad01da8204b847d evallogs/medqsum_test_2024-09-15_21-28-05_gpt-3.5-turbo.jsonl
d147aae262b9ce58a12d48575f711bda481eae61 evallogs/medqsum_test_2024-09-15_21-30-22_gpt-3.5-turbo.jsonl
406fae1e4f5f25726a65e5b0b1741a432775cfa1 evallogs/medqsum_test_2024-09-15_21-34-40_gpt-3.5-turbo.jsonl
d9d6f6a97aad27907f5f0008dc3f064923d1e688 evallogs/medqsum_test_2024-09-15_21-36-53_gpt-3.5-turbo.jsonl
6ab5187f1562a6003ddcf2669aec424bd0396775 evallogs/medqsum_test_2024-09-15_21-38-31_gpt-3.5-turbo.jsonl
aa790fcbf028371c1a3b5d350c9ccafffe29378f evallogs/medqsum_test_2024-09-15_21-40-07_gpt-3.5-turbo.jsonl
810dd937cc24edd7f139ece21f7385058fe02ac4 evallogs/medqsum_test_2024-09-15_21-41-35_gpt-3.5-turbo.jsonl
d33c44fd7ae5a0303e86c2b697bced74d01ad11b evallogs/medqsum_test_2024-09-15_21-47-04_gpt-3.5-turbo.jsonl
0aa3bff04325c4be99f51de216245e9771ecc04e evallogs/medqsum_test_2024-09-15_21-48-36_gpt-3.5-turbo.jsonl
8080c6e7eeec8b79e0ce9e6893e55adcee2165d0 evallogs/medqsum_test_2024-09-15_21-49-44_gpt-3.5-turbo.jsonl
3681d875db6fda654bfa3730f10f08f7b4a11343 evallogs/medqsum_test_2024-09-15_21-50-45_gpt-3.5-turbo.jsonl
a6439fa3695bc751c32945aa421868c6eb45c566 evallogs/medqsum_test_2024-09-15_21-50-45_gpt-3.5-turbo.jsonl
4e8dc2e883653a4e89c4aa2f33ccbba9d9ad6f01 evallogs/medqsum_test_2024-09-15_21-52-25_gpt-3.5-turbo.jsonl
33ec971cf167823c925f05364f0f4668c56455fe evallogs/mimic-cxr_2024-09-15_04-34-49.jsonl
15748b46e62811e25f3b1057a3204536dc179f2d evallogs/mimic-cxr_2024-09-15_14-49-17_gpt-4-0125-preview.jsonl
7d7876d2995a65cf35c0bf10ee01d92cad4a5006 evallogs/mimic-cxr_2024-09-15_15-18-00_gpt-3.5-turbo.jsonl
e76d38dfb67973522f07a5048192674c03a21767 evallogs/mimic-cxr_2024-09-15_17-49-16_gpt-3.5-turbo.jsonl
8c6b5faf073a55dbcf85311ba3c64f86d589da0e evallogs/mimic-cxr_2024-09-15_17-52-24_gpt-4-0125-preview.jsonl
9b40cd2b7fbcf538aa715ef693b49bc9aa80434b evallogs/mimic-iv-ul_2024-09-15_04-38-25.jsonl
990fce8aa36e76a23b04c4b167d04c428f681840 evallogs/mimic-iv-ul_2024-09-15_15-17-17_gpt-4-0125-preview.jsonl
0c6ac289b776fac253fe5b100878b7abd78431e8 evallogs/nejm_2024-09-15_04-53-10.jsonl
4e64186a8a3fa7bdc0ab7268f8b25fa142da35bb evallogs/nejm_2024-09-15_04-54-01.jsonl
7519b49435b5263a8e6fc057ddfc8ec63fe1609a evallogs/nejm_2024-09-15_04-54-25.jsonl
5ff5482330f51a964bbacb6a9c6fb026d382e7aa evallogs/nejm_2024-09-15_15-58-27_o1-preview.jsonl
03831666d3e6aac6fbd005d8e26b6769ac695aee evallogs/nejm_2024-09-15_16-04-27_gpt-4-0125-preview.jsonl
783c59e3f1892c9f5ca0aee1bb9c16bd04d746da evallogs/nejm_2024-09-15_16-06-06_gpt-3.5-turbo.jsonl
77b806bf49533b4206f2abc232f79db6816096df evallogs/rct-text_2024-09-15_04-06-19.jsonl
24ae91a437a2d2ce1d7d16667729a9cea67d070d evallogs/rct-text_2024-09-15_04-11-57.jsonl
dce2090a2cf7114d73b639f279cec7a6458ca620 evallogs/rct-text_2024-09-15_04-13-47.jsonl
1ab77ba4cfc1aa9bdc66598c962d6f2333d3e7fb evallogs/rct-text_2024-09-15_04-21-53.jsonl
ddbd2e6c94abb2e01b8e61528a1d1d39aed2b75e evallogs/rct-text_2024-09-15_04-21-53.jsonl
5d2a403e666eaf4ab951a87e10decf585027c31f evallogs/rct-text_2024-09-15_04-29-27.jsonl
f7b4f2719ce3097908f1c98038958a7954b1b6b0 evallogs/rct-text_2024-09-15_04-36-29.jsonl
ae6fcae844f9f7f921a21cd88d39cd5b92d172ad evallogs/rct-text_2024-09-15_04-36-35.jsonl
ae63a40cfbc1e65fc8654234f85a454c5468f77d evallogs/rct-text_2024-09-15_04-43-15.jsonl
a8555643d68062258bbe2488e0fdf38495344383 evallogs/rct-text_2024-09-15_05-05-43.jsonl
dfc12f06f07cdd3dfc046d4a3912090df03c3846 evallogs/rct-text_2024-09-15_14-45-05.jsonl
8028b4adaa4353da3642e73a1c742a7811d6165c evallogs/rct-text_2024-09-15_16-01-29_o1-preview.jsonl
09065596efdf679c06fc8ac082109e20d2b219fb evallogs/rct-text_2024-09-15_16-05-28_o1-preview.jsonl
56a61edaf78bfa5a7f396b56d38c1b66f3b1aba3 evals
58f918963c8e9ddd2fc3a59d516bd993e6d0aa99 evals
5a79a94f568118e9fa2b3d77b6e97e3e6427d629 evals
ade89f72bd99df6cb354f580f6db687be0088581 evals
d01e6aafe087cba4fd56531236b17d03479eca69 evals
d082dc78d24ec72836017b55c075594ae0221414 evals
eb577b076bdc27bea6be0f65c8edeaf6825f97a4 evals
f6e7d9fc429f0d0d8a91162a824203c6a3267ca5 evals
dfd580f8257c640f18f1de5dbb088643f50ff998 evals/__init__.py
5dedf95d928aedfbebd1725a67976ad75c6499d4 evals/api.py
62d015833d8139d791f33f7a7c2e694dcfc9a360 evals/base.py
16b5f36dcd996efdbb13464d332c7e9dc108180a evals/cli
2d19ce670c539b17f87caf4cd1dc272aad810c3d evals/cli
36dce4c5e8311a8c35afb1d5c2496a15b2e9f611 evals/cli
7cf33c5bbee10f7f6ef599abfce1bb8db5fbe3eb evals/cli
fd7415cccfc1a20cb1fb82565e0c8d924e57dc60 evals/cli
45fe5871df653c410774f84b5e4493036b0c7021 evals/cli/oaieval.py
c1addd43bb419cb96b422be4bd220f7ad0eeddc5 evals/cli/oaieval.py
c31f5ee1780751eb6df8a276c1069d64172c04c1 evals/cli/oaieval.py
eef2b6e5ac79f2ab6629c13192fc01b93a4256ab evals/cli/oaieval.py
52e6e69001a234f792b0c660c7410a31a220c08e evals/cli/oaievalset.py
ee5a163dde72a33c6f6e0d1479f98ba4c197804e evals/cli/oaievalset.py
080cb9bc516dbe994bd05d5f63ddfe148a990672 evals/completion_fns
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 evals/completion_fns/__init__.py
f0a6472f3afb1f3f0e5fad85dedf45b637ea2328 evals/completion_fns/cot.py
20e96b74fcec9a43ef7b24d32f2a0a5d7e28b26d evals/completion_fns/langchain_llm.py
9f986bb1a43258567497d17fc30319e40fa594e4 evals/completion_fns/langchain_math.py
21524bfc1aa1e4654741d30fc3bc33cc958ab726 evals/completion_fns/openai.py
6ef998600c0e7fbbcf0123758036fa05adfa5a13 evals/completion_fns/retrieval.py
c6c72d7361333367dc8f858447046d960d7987c9 evals/completion_fns/solver_completion_fn.py
448cd5e3a9dfcf6bf2e00ffb3018e3d6e7baeecb evals/data.py
2a22d82b4dc94f9012f666abdb558a24d6a79d25 evals/data_test.py
1762ed71fdd419dfe595f2c82f33603870b6c062 evals/elsuite
852a8612eb351b5d97eff384d867671608944b73 evals/elsuite
a1e5ac1356f0fa6f1bfd8cd1ac3f398ba6f3f7a2 evals/elsuite
a2a1f69e168556f10d1759f0bf16a0d6a6f3104f evals/elsuite
a6c5aa7171a8a99b47f59fbedc5b8f5754469d00 evals/elsuite
c76624b6937da89364cdcde8b13bfe3d262eff60 evals/elsuite
d1f7b3147512b6bae24510fb7767876a5a0336bd evals/elsuite
ef6e53113f840859ef006e1c46612ceb7e3cf503 evals/elsuite
1d9d2377feed2a6f16a6a8427ce87608d037bbe1 evals/elsuite/already_said_that
bdb5274b1e67e0efaf8e35af8a4a0cbe0ce0c286 evals/elsuite/already_said_that/README.md
712d172ece22c76dd1d4f7c350b4fc43a70ab242 evals/elsuite/already_said_that/distractors.py
2fa495c7022c46bfd3875fac14f40ab01219c80f evals/elsuite/already_said_that/eval.py
cd3bbb4490433df33e80d688a38ccb0543e49e1f evals/elsuite/already_said_that/prompts.py
b1c75075d64c453a3ad698124ea8139adf88d8d9 evals/elsuite/already_said_that/scripts
03c88ece889ac1d652fff5df6da99bb7ec456e90 evals/elsuite/already_said_that/scripts/data.sh
94f827d0d668972fa959d3f64db0edcecd9e771e evals/elsuite/already_said_that/scripts/gen_data.py
ede36291ec68e2e6a5e0ef3ef82ef7943b6af3ec evals/elsuite/already_said_that/scripts/make_plots.py
dd300f61417ef3fc121b8be617fb7f24e6acb81a evals/elsuite/already_said_that/scripts/run_experiments.sh
5eed8c84a6e72c8175074b2802303f18a21f3991 evals/elsuite/already_said_that/solvers.py
d565274cbb48ca6800e867983824c707727c295a evals/elsuite/already_said_that/test_distractors.py
f535fd97082dd4369e0f848d8cd7df55baf5e4c9 evals/elsuite/already_said_that/utils.py
48cc00cb08c788798f9fdfff1a9fad0cf6dbdc58 evals/elsuite/ballots
67c44567b62fd3ce6e64f37ae5b6cf6c95318143 evals/elsuite/ballots/eval.py
bb698cb2442bdadb8f9e75ec5c971c1e81cab2b5 evals/elsuite/ballots/prompts.py
169490bed545d048d00e085fe522499979ec35fa evals/elsuite/ballots/readme.md
9bec4f5440ab4a4796d6d2b02dafdf1fb2a0a5fa evals/elsuite/ballots/scripts
22cf3bce992515372bc511279daf0c3830b112d9 evals/elsuite/ballots/scripts/make_plots.py
df36bdb5c6f69d2ec8699e9e463529614d5a8e8e evals/elsuite/ballots/scripts/run_experiments.sh
95e610f44459cf44aa0db0da661bcd6e97529286 evals/elsuite/ballots/scripts/toy_run_experiments.sh
35050adf3c2f4667103f96dd391f422e689a81d6 evals/elsuite/ballots/utils.py
1f177fe6651f3898a058c794949936267a2b3e1b evals/elsuite/basic
35d08ecedbe9264cf8152ad7b6e3c364c4d65f02 evals/elsuite/basic
45194cca3409ee79746c5416b57f080d4620f97e evals/elsuite/basic
879bfa8780dc8adf8fd8ef14d2ecb3375d2e1d8c evals/elsuite/basic
b0e8735da31ec6807a535a6fe1404f24565e66a7 evals/elsuite/basic
c719ad606d101a068933e4d91a2f625acb819abb evals/elsuite/basic
e944eaec975aac21273bbc3dfae524f9c3ff8c40 evals/elsuite/basic
f936e4a8d86d98765464be74f7211e1dc426631c evals/elsuite/basic
4596b89f4606206c97bbebedd6e8436f2d309724 evals/elsuite/basic/fuzzy_match.py
ba23a7193914b58c55bcb11cc295666374b80a7c evals/elsuite/basic/fuzzy_match.py
ad129c1209325bc5df53c569e2b1ccd03723c0aa evals/elsuite/basic/fuzzy_match_test.py
f435ff7c2c5342a8f493ef0929f84db21d0433f3 evals/elsuite/basic/includes.py
f4a8c960f02af2a18c9eed628bfe1a4fb98219c1 evals/elsuite/basic/includes_test.py
dfaa00a51a56b283d705c4f8f8bd8361356b0276 evals/elsuite/basic/json_match.py
84d3cdd7329ce16795cf260ffed352981ad7aad6 evals/elsuite/basic/json_match_test.py
11ee28caa2cbde2d42db7507766e861e926c45fd evals/elsuite/basic/json_validator.py
b74202a2fb6d3e6dbc097affaa34dbdf27cd340a evals/elsuite/basic/json_validator_test.py
7338eb59c37b78928274f7e810580f67b48f8302 evals/elsuite/basic/match.py
b49ae2a5fd1064661cedcd4d8eba4d718a61a486 evals/elsuite/basic/match.py
cf6998fd17f3ba389a0351e760b4451dd6329d16 evals/elsuite/basic/match.py
d19ff553184c31a679b7e23d65993d8b4efffafe evals/elsuite/basic/match.py
9debe2f4e9b589df81b06acde9457d5e12c38929 evals/elsuite/basic/match_.py
349e9b31725663bbc3926ed6b59f312af606c892 evals/elsuite/basic/match_align.py
271aef66dbb543c87b37759270c654a9adf3018e evals/elsuite/basic/match_all.py
b2e747f4d8e040005f8d3d86f5d74a0373e0e190 evals/elsuite/basic/match_all.py
44cb2b500d30c80d459b4e589a651a8884e2965e evals/elsuite/basic/match_gpt.py
6193169caeb37f2dd67b17cdd72e513303e8dc39 evals/elsuite/basic/match_gpt.py
86e96c1eb349acb413fd4e55650ea75e54ca6f92 evals/elsuite/basic/match_gpt.py
2c14392831c97e2151c2e4107f42886b9ff6e6b1 evals/elsuite/basic/match_nlp.py
5cd20559f3bbd0510690993173e444ee4ae141fd evals/elsuite/basic/match_nlp.py
71e3a795d976055d675037813f71fc6529a1eb4d evals/elsuite/basic/match_nlp.py
81b1f8326e9da7dfbb989613f01224f83810304c evals/elsuite/basic/match_nlp_gpt.py
7e6b162ca7d3f663077366174c2933a514cc9f06 evals/elsuite/basic/match_nlp_gpt_hallu.py
f7b346971cc1a23d637fde2bde7774227c6153d3 evals/elsuite/basic/match_nlp_gpt_hallu.py
a82e1807f2919888e367b19f0622c049ab20e24b evals/elsuite/basic/match_test.py
2feb57658d851f010ba95fe103ad80ecd6a41304 evals/elsuite/basic/match_with_solvers.py
f518418bd54b739300765143515b306b674bb4a5 evals/elsuite/bluff
a3ada149c14a7fe5a0157c9184a6ac58ac147803 evals/elsuite/bluff/README.md
03b6f1335c508b67d55bcbcccf96e03d64344620 evals/elsuite/bluff/bluff
b24b6c999cb1dac7377feff75265c5ed32c7dbad evals/elsuite/bluff/bluff/cards.py
624b009dd9b3b71473bc600f01763a2d1a137007 evals/elsuite/bluff/bluff/game.py
27de79f0b105decdea12a7a8a3328908c3af3422 evals/elsuite/bluff/bluff/players.py
1ac100a8d4aed6f57d787e9950f18aad49280a21 evals/elsuite/bluff/bluff/round.py
117384d5d10550225b6f95f11c4fb4350cc272db evals/elsuite/bluff/bluff/task_description.py
b789afa4ac58ad4d1e6f95e5ee28dcdf2c887427 evals/elsuite/bluff/bluff/test_bluff_game.py
29d7e9cd923222bc3495f8afa82ff2814147f9ca evals/elsuite/bluff/eval.py
362a68ee45f82c8aa758767d021c722c035e4600 evals/elsuite/bluff/prompts.py
835c3cc8e8e4b18534b3eca8277c57f7fba25fad evals/elsuite/bluff/scripts
1c3337a1ee8ccb65dec51e1e4d67c0dbb5d65178 evals/elsuite/bluff/scripts/make_plots.py
df4b44797e4abc7178c4d7073828d84fbb2bd01e evals/elsuite/bluff/scripts/run_experiments.sh
07289fde20559f9b2a24c7c2bacb2ca37f90bccd evals/elsuite/bluff/solver_player.py
1c5faa799be3aec4172df0c66f1e1305a47457f8 evals/elsuite/bluff/strategy_solver.py
0995d79234ea25c598ec5f952ff81cf5a527edee evals/elsuite/bugged_tools
9cc5edf29fd0662d18e4badc6d77d6f4c72406a9 evals/elsuite/bugged_tools/README.md
176bd5246e56894cef93ddbf34692f2ed38db1b4 evals/elsuite/bugged_tools/bugged_tools.py
38cbccd59479fc7d450dd2adf62dfe3f9a7f1cee evals/elsuite/bugged_tools/eval.py
33d97aa997624668fd38057178f13769ec98094e evals/elsuite/bugged_tools/scripts
478d9404b73f13bf0569a4a727d53659bcdf25c2 evals/elsuite/bugged_tools/scripts/plot_experiments.py
5f422ed3b045a582b792988ea2ca2c9439c41481 evals/elsuite/bugged_tools/scripts/run_experiments.sh
096dd8f52596550271d4e5917c9a37fc82f768c7 evals/elsuite/bugged_tools/task_description.py
ec3008a6becfa135462d31a5e6622837f4e92df6 evals/elsuite/bugged_tools/tools.py
c5c2f7b196cc23cadc1cdbfd7c39803186febf63 evals/elsuite/bugged_tools/utils.py
1a97291e91831b6f7054d58af99d3930d82b5a4e evals/elsuite/cant_do_that_anymore
e143f278b9c744064a617550a80b7235ce2187c4 evals/elsuite/cant_do_that_anymore/README.md
0e84ed5672d2debc30c54e733e328f78e02201bc evals/elsuite/cant_do_that_anymore/chess
5537b9d5f4eb335227ab819f50701868de4f08ff evals/elsuite/cant_do_that_anymore/chess/board.py
0d163f289c152d930cc892acfda4dbec945ca187 evals/elsuite/cant_do_that_anymore/chess/board_test.py
50f48c78e167ff258ffaee8c1591fa1aadd44f7d evals/elsuite/cant_do_that_anymore/chess/move_variants.py
3d7b113b51e2f120cbcb45861d8787e1cef984ba evals/elsuite/cant_do_that_anymore/chess/notation.py
9692a0170c5e8e95545163f9dc0ae22a763a93b7 evals/elsuite/cant_do_that_anymore/chess/pieces.py
a92d0720375f6ffd77f4969b6c5cd5c1aa6fb474 evals/elsuite/cant_do_that_anymore/chess/utils.py
8c8645f824100257bf388a43853e1de253d003fa evals/elsuite/cant_do_that_anymore/defaults.py
0ca6df5b0b430d9a0e16ea631a83f8e7bb954208 evals/elsuite/cant_do_that_anymore/eval.py
210db5eb178e8ae7b16366bce0bc06195c5c5422 evals/elsuite/cant_do_that_anymore/scripts
e0c7a0265abe65cb54ac02640e456a2e024aae13 evals/elsuite/cant_do_that_anymore/scripts/dataset_creation.py
491acf3c95927fe112db994eccfdc413f7a0f1e1 evals/elsuite/cant_do_that_anymore/scripts/diagonal_dataset_creation.py
bd0ea4d5cc3bc6046d6d335f975638b082d3fbfb evals/elsuite/cant_do_that_anymore/scripts/make_plots.py
68fe4ac5e70d79472f3649ca15ca69158f59e519 evals/elsuite/cant_do_that_anymore/scripts/run_experiments.sh
519aad85960342df34d858a22af2d915c3f68301 evals/elsuite/cant_do_that_anymore/utils.py
b7bdb9dd174ca48771d53e55a2cbb38564d32e17 evals/elsuite/error_recovery
78f6ceb726dfd939c457d7ac6f16d9e9b4358eee evals/elsuite/error_recovery/README.md
ee5560a08de4cd409ca9735ab17327cf737486d3 evals/elsuite/error_recovery/defaults.py
89512179fe5156c01007364e84697f3a19df9c35 evals/elsuite/error_recovery/eval.py
0bbef1472950ba6148b433c48a3868f1f3409e60 evals/elsuite/error_recovery/scripts
c6c14b24177f280e784481156b6b2c322b72d71e evals/elsuite/error_recovery/scripts/dataset_creation.py
0d2dcfaa4340dee2413ab5041cb7372a2dd59488 evals/elsuite/error_recovery/scripts/make_plots.py
36f51faad400d5c59159ba96aaaefad62fcac86d evals/elsuite/error_recovery/scripts/run_experiments.sh
363f937f90b1af753568f877a910ea6e422847a8 evals/elsuite/function_deduction
924b4e47fb830acca15544f078a01c1b2d7ca071 evals/elsuite/function_deduction/README.md
3a81624e03e3db0e082da986754f79e6fd6886bd evals/elsuite/function_deduction/baselines.py
65428521538fa4fbbda1bc7dc144909aa21f97f9 evals/elsuite/function_deduction/eval.py
6e36c3b3e9066d00d357ff2f1889ee761320dd72 evals/elsuite/function_deduction/prompts.py
287b7d264ef07d9e729b98faa5e9a97843ed41dc evals/elsuite/function_deduction/scripts
857219e16799e7b74f80a9f52867611e8f9b94ad evals/elsuite/function_deduction/scripts/dataset
931e1cc27af5e6754f186f15a038a85260ad0a31 evals/elsuite/function_deduction/scripts/dataset/create_dataset.py
ff03a0c76ee1ed82973b25f28f63f690aa4125ca evals/elsuite/function_deduction/scripts/dataset/raw_code.txt
4c8f5f5e782b1e35a1e1358c3af914e46440f6c5 evals/elsuite/function_deduction/scripts/make_plots.py
4e67f5c7be4349e3eb55af3f56fe792aaaba9e2d evals/elsuite/function_deduction/scripts/run_experiments.sh
96ace2cc15322903a780a5db7d41025fdea272b0 evals/elsuite/function_deduction/solvers.py
9a876951bc0eeb6b82e56dfde1ada4a35dd31d31 evals/elsuite/function_deduction/solvers_test.py
0cc19baaa4aae01ea095e71c7b3e9797d51f5837 evals/elsuite/gpt_eval.py
0fab748baa33f8aa8f954cc69b760393b063a735 evals/elsuite/gpt_eval.py
bbc0e70e704816b6c71147ee4ba3f79c92c7afd4 evals/elsuite/gpt_eval.py
63a124b2b8151317abbe5ae2e77963352290ce26 evals/elsuite/hallu_eval.py
bf7b8043dd0587a18761d3bdfc9c7e3108ccf995 evals/elsuite/hr_ml_agent_bench
2e7740efd9feffaa37d1bbabe9e518cc90c1e350 evals/elsuite/hr_ml_agent_bench/.gitignore
eafef9c7ee71f7518a049acc9247064659ca7781 evals/elsuite/hr_ml_agent_bench/README.md
a0f05d3773108aa7c89a8bca1a840b5fbf7a5f00 evals/elsuite/hr_ml_agent_bench/actions.py
5f43153e691b2e2a7800932ae8ade812a04214ec evals/elsuite/hr_ml_agent_bench/auto_marking.py
c44b79beb5ede479a227f0091dc3f7f845f1e0f3 evals/elsuite/hr_ml_agent_bench/autoeval.py
97d5ca731de371139f125bcb3603a0a1f95c3bcb evals/elsuite/hr_ml_agent_bench/benchmarks
5ffb7631a2b7d2a453aecfc7196372b84c2c7365 evals/elsuite/hr_ml_agent_bench/benchmarks/ant
c3383baf7930c5c45d7618e78493fb0ff6c44ba4 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines
e70d75a17859c9d2fe89512210bf85be33d52a8e evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/human.py
e7578d8194bf989c8332f4248a90bb20356c9fdf evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py
8d5ad6527d8087400796fb603e4aa3effc2f6801 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env
7af7e12ecc3a666fdf74987a63be0b89243567bf evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env/environment.txt
5414766b8b1c6118e7777bf3ffa4ffc2528406af evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env/train.py
a532b1d31f5fd7e37a4307bd2c79b3276964c440 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/scripts
a17c4e43518e31913f306f34c06de84135f09329 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/scripts/grade.py
d76c310f2c3841cd264257d21923bd525845b8f1 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker
4127531ff5e33e61fc7a656c2c075253cbd641ad evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines
cf447fe30547aa26b53bf88c5e16caceebec8ed4 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/human.py
532c0f7fdb0e06b62676f602e76ddd9d5f1fe1bc evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/naive.py
df59fa152a92a761bfc09eaa1e3bec5b8490b0af evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env
117c4d910451d193d265f23ab7cf9101c0166137 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env/environment.txt
6496941bfed486e91e97e5485cab01517aa1299d evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env/train.py
a628bf90e2f311aefe799a65121ba0d4a9f8b39d evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts
ac160908859215a50a578f6ebdee95b6502dd9d5 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts/grade.py
c4c61681b8cbd6d37a9231ce9ea2d07360327aac evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts/requirements.txt
9d2a0b6f1d70b0259b14061bb59f84f42c49d582 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole
d7561620199b1b283bda7000ffea0fe585c087a9 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines
dbfa721ddd01482ce5ce85e1b62cb614e1ba7048 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/human.py
c738367ce7654811a78df14cfa714f5d9f7001bc evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/naive.py
4af20e061bcaa31447b78879e7b5432e7424d41f evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env
9ab00b42de2625526940756a8d55043087a661e5 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env/environment.txt
264b746d5ba01c840595b48a1fe7550d3da0dd90 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env/train.py
7315e1337537bed0b7ad649af6d014629769d6f8 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts
6991614471bdaada9d36c09fdf9c3b6d677727a5 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts/grade.py
4f93689309493071a2109afbbb4243c14b3bb9e7 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts/requirements.txt
ad840838c2f808c6bf2dccdde366d34a61e939dc evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10
3c851184e5553e496d2fb2c257c0dea741643cbc evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/.gitignore
c4604fb923970736fda1a52770973ff53dcc07b9 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/env
5114dde5d23b8767c96862b13d3e75b602bd4658 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/env/train.py
539c9283c01f83fae72c88ebd0ed3842af9caa12 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts
c4c05b0233c895a40f64acc0191ae595e3475188 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py
69dad1e04cc0fc0e641b904145e46fe7530e63bb evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/prepare.py
07f43b870eb6ed97c32d56947e935a0a6939f3c7 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/read_only_files.txt
e35531e566f2a925d851b9d3b8fa99645838e6e0 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/requirements.txt
3ff48bb60a4b7543b36f0f3e751fb371f229d1a4 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback
bfeab87ad58b3db6aa4af70ab3959086c23f124e evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/.gitignore
e0205e03322b8c755712d7e2032f4868ab03dc0d evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env
6fd95592b7d2b20b31fd8c66d907cb15d029abd6 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/data_description.txt
a0f730b09b35ce85a902107e1d39f4857273e827 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/evaluation_details.txt
8efec86ac656647a247aa2eb9811da8f7a993aef evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/train.py
99aac2378b6be9db553ed1096eda641615e7d2d3 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts
c14bbe6d28bb09d6562689504624c87b7fb3c915 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/grade.py
ec2b12cd967a74b6c22692a686c5d822189322ca evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/prepare.py
b52a2f84949552fdc7bbdacefeb1a171b482a09a evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/read_only_files.txt
ad5bd865d3ac41ea169952322fa87abe9622e42b evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/source_code.txt
d6991d6425039a9f92ea69af9706d4f0cec18138 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price
cce8f148f12d891b1471b8fffce80a343492e964 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/env
cba0710286136ba64d273612bda60742f00f1501 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/env/data_description.txt
eae33a1a175ed7edac57c096740d0a3309f12985 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/env/train.py
fd8253981b324b8420e40a0b6ba274f8d029c350 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts
771bb34be60a28961be9c03860bf948dfd60df10 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/grade.py
e59a0264a736510e9a574bc1eb52e44b2b450d93 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/prepare.py
a8231dab7337264eb28722a6eaf6f7a0447d22b6 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid
fada82e9e4847d847e3b63080ec4ca081a254a0e evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines
bec29d3eba8cd05e175498ab790a6c9e264167ab evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/human.py
b7ca4c9de0bbb873b64fe028352f5866dc6020e8 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py
85ac26e91a059aa589cdb3e0164dc7d8a567c53e evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env
800da9bb66f869e72b3f097f82aa7b1d0c6ecd9e evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env/environment.txt
e84e1f53148b5bbf68b1a1823b488b8a8292cd30 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env/train.py
e548cadc8b589e3718bc73a34116fd73848c5b24 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts
2e4adab2f9c650f71c3084f2f202f4dcaaa4f05c evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts/grade.py
46ce126c9e9d4b5428d0b60337d4ba8178a2f2bf evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts/requirements.txt
55c251832d489209aa67dbd1741acf97f2e743e5 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb
e43c73b02bccc722ef8065f8d83d432f7b2e984d evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env
6d4db12f37efb5b1bbd47201bb8a143059f8e050 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env/train.py
e91bcee5c2f5acc60ffce0038645501d75b9d8c9 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts
42c7fd3dbe3afb584d4e97fec71027f6213bedf7 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts/grade.py
3db5954c2bb8f2617e8ba2e186caac386754ae2b evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts/requirements.txt
6f3dcb8e48843e199a8f60252e9c5610b6fb510e evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum
7bde1d9cc45883e184909462742eac834aa0947f evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines
6540875334b534b64dfc998be55312b3790ff4e5 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/human.py
4f9f158d9c3c8a91a11d32b5341b8685f88760a4 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py
399b00fdeceacf5e7c975f0caec05f55ae06f04a evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env
a04f440a6938668d8c6790740a576f66c087d00d evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env/environment.txt
d59bad07dc294f88adeaa3f74a0850033944e32a evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env/train.py
edef10d6e449cc932064b8014fe25d140d03ed2b evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/scripts
27aefb7ac9ebedffaba65200d5bff204468c32e0 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/scripts/grade.py
ed98944a0c78e8028bc3952b06b9a1ca7b0bc0c3 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv
d2b41cdc4d7b99bb5255901cfb0ed6bc80540a86 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/env
662909eb0c9585de8bd28c578493a3d93cf23562 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/env/train.py
ed996411847a2ad958cd47fbf3eb40d9bad65b42 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts
979a5ffa818fe4869ad9b5455b48bc7cbbe3e87c evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py
32975c37465c121803ec89b75c6532a4579ccc1b evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/prepare.py
ba133ed98119d6a3ee41ab82afcd61181f4cdf32 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/read_only_files.txt
874102dd66aedf035771d9552f4dfbc7f3d53d06 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/requirements.txt
8a78fa4e997b1aeefab8a0eb118885ad965ae96b evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease
1df175f5a9d656596a4ac11d10aa7ebaeb7cb284 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/.gitignore
a23ff894730f847682c5e8d599251e58330e97cd evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env
36cfae892c480e07c7b9acdba2a6d5056a2fb74e evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/data_description.txt
1cb872403c0b6951af786e2da9e42da79d0cb4d2 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/evaluation_details.txt
eaa03676dca2fa89d4ad4821d210035de0dafa20 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py
126e6388746bc6e052feb4a6cb337247a7cb85d5 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts
482992710700e36aa7e1c63fb1542de68025f4f7 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/grade.py
32d28f7ee7abeefc45be21ea682730ca686a8894 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/prepare.py
60d3d323682051d00feeb235ce72afe52b9ac014 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/read_only_files.txt
dc72e01dbd1747b16ba7977187bfe81c23267a13 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/source_code.txt
6870c9d181a496b41b2332a4ea807ec80e9c1613 evals/elsuite/hr_ml_agent_bench/benchmarks/pong
b4cc43a02d69a34d65b9012441d9d56c958ea9c4 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines
3a6deaffaed640c62399491f6b0f2361f334e80f evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/human.py
c72b6cd030a966357c36e69243d2df0cd9e78533 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/naive.py
6f5f2f8058045d8b9da7aeabdd44f3e941ecf0ae evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env
b36abbfa645ae02c59f5a86a6942937964e25b75 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env/environment.txt
0bbfe17f15204cac756f3d8a567aef641757d7d6 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env/train.py
834de325dbfde17437a3a6f0c598da705d629409 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/scripts
2c25c814e3030b83db3e60dc490243aa6ec1f5be evals/elsuite/hr_ml_agent_bench/benchmarks/pong/scripts/grade.py
81a94b1a1e9e2e4b432145de67f25bac775d84f7 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher
19259e5d89c11b086d594859adebddcfbfb35bcb evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines
39cd471c5e10d0bc123127cec2b805aa3806c1f0 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/human.py
cbad322131dd3d206c5da1307b873cd74037b39c evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py
310321430b1f835f34a61ee2d9b9b9d95fc5d002 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env
04db64bb1997474478dc08df0d115abb863fe7d6 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env/environment.txt
fac1b6cbfa3f9ae09c482a156b7c13c184a62e02 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env/train.py
22fed47f529663ce34179e888f19be1134078f81 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/scripts
c5cf438bb169838c3e700fc5b4554d8ae7fead16 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/scripts/grade.py
46abe05863c25bbd8cdd6a382956de850b35087b evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic
1509acb42f9981f4d0e62a8a0e8e53465672bf9f evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env
6c430f13a25d4f68ba60be8c24794248656840c5 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env/task_descriptor.txt
861594e53f600050c0b8f22871acb9d99b8a80fa evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env/train.py
c291f7b9d5cb448cb2da5d97f4b28826f0827dc0 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts
040a3774328587d4e3932a8241884edd142fa272 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/grade.py
4a4c2e4fde6288269d5617807ee023c0ea657b37 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/prepare.py
10ddd5b71e19a3523db9e828df5a09693d2d18f7 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/requirements.txt
1b18fff4edacf78fd21ce540e26739dc2cee0292 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/source_code.txt
d66b798f59880c89b44d5bbd3bc2bd632ea21cf3 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization
70253df06a89f96c5585c47b98631c45c221a946 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/env
0ff942dc364f951632e8cde65ce8bcf880852fc2 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/env/train.py
f45f642216e1f0f0756ce7778bbe3ec484b4bd5a evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts
8133f3a6a462639a77d06141f0d589318f9c0e0c evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/grade.py
ea0aacfc3adb370269c52002193e87bc8f549ab4 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/human_baseline.py
d3a388477ba5faafd2e92a312cdfa01630f9c7af evals/elsuite/hr_ml_agent_bench/devcontainer.json
026e3ea22487d35f5f8e2e1ff21bd95d5202cede evals/elsuite/hr_ml_agent_bench/environment.py
611be177900c911795aa649c4a2e4d065db88601 evals/elsuite/hr_ml_agent_bench/eval.py
8383376367e7ad5fc469e03e2ef2fe0b3b1bc504 evals/elsuite/hr_ml_agent_bench/high_level_actions.py
10ab2c93c1803d390a570c824c4bc852bd199d72 evals/elsuite/hr_ml_agent_bench/low_level_actions.py
5d0ba7cacdec1d8f493bbd5d850e811bda516930 evals/elsuite/hr_ml_agent_bench/prepare_task.py
c25f9e9bc49893c14182cd08fba0a0071713edc8 evals/elsuite/hr_ml_agent_bench/prompts.py
16d6e76a047e2819b498ad068dc6e10afc29f43c evals/elsuite/hr_ml_agent_bench/requirements.txt
01f9a079406a0dcc19a85c9c127a7d7cc1caaa00 evals/elsuite/hr_ml_agent_bench/schema.py
c5dd7291d7eda3e708fc181f1b1633891de99e9f evals/elsuite/hr_ml_agent_bench/scripts
85d54e3d29499d37bd09a6bd31b6032f6cd310fb evals/elsuite/hr_ml_agent_bench/scripts/install_all_requirements.sh
1e849dca915675514219f61d5d9643d3b3f65f28 evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py
ba591847f4aef8d02b8578d1e1628aed89bca969 evals/elsuite/hr_ml_agent_bench/scripts/run_experiments.py
9d2c5e9bd9c76be1496bda272458d3def5bec66e evals/elsuite/hr_ml_agent_bench/solvers
531d3a08c90570761432c01f24baa0da4bbc5449 evals/elsuite/hr_ml_agent_bench/solvers/baseline.py
776d65905166201fc8a3422f4c885970a65c745b evals/elsuite/hr_ml_agent_bench/tests
0fe9a834a424509aab214a6e122a70a30dd4379c evals/elsuite/hr_ml_agent_bench/tests/test_actions.py
c37d8b1c4fb3f4dcc3d390ae19e2852d8b2f0c27 evals/elsuite/hr_ml_agent_bench/utils.py
cf3bce679d03dd7930c0480a17576372224704b7 evals/elsuite/identifying_variables
e256da66cb94484c05478dec79552c30f5678908 evals/elsuite/identifying_variables/.gitattributes
59912f0b275a5ca01c9cb63cbbeea375ac8b4d6c evals/elsuite/identifying_variables/README.md
60729828c7fa62743591cb1cec4d25f29bfb506f evals/elsuite/identifying_variables/constants.py
31b3b743e06b5c7eb162284738a2a4c5e496e438 evals/elsuite/identifying_variables/eval.py
815ab968ccccb28ca62d2bfbd21498dc95f415ba evals/elsuite/identifying_variables/graph_utils.py
a37f9fc094015e059c7dd137008ebdc68b352df2 evals/elsuite/identifying_variables/images
59de243e2942eb93d0cc85d6f51defedc2fae383 evals/elsuite/identifying_variables/images/control_var_tree.png
d005e47b4724470b7dac0d03f5d35a294a7421c5 evals/elsuite/identifying_variables/images/valid_hyp_tree.png
6f66a1c44e6907f5f9714281b83034ab6641ea9a evals/elsuite/identifying_variables/latent_funcs.py
501ec3b1a9ed3cbcc14ae73f64ba2edaa9253cf4 evals/elsuite/identifying_variables/metrics.py
4c2eecefdaaa62380d2ea231594fc2509f74afa0 evals/elsuite/identifying_variables/prompts.py
648fd6694b79b1c2182a6ee20066a1df5dbd9572 evals/elsuite/identifying_variables/renderers
c15562476148e67a0f054a5361f66886200fae7e evals/elsuite/identifying_variables/renderers/__init__.py
90c1d27ae5de142286af324d9dfcf3e790b1fa51 evals/elsuite/identifying_variables/renderers/base.py
39563527a6a6f6b6d7c3d957def2e2c3d1d52079 evals/elsuite/identifying_variables/renderers/corrset.py
0feb8b38fe325fcf76ebcdfd76cac8480051aff6 evals/elsuite/identifying_variables/renderers/tabular.py
c7a90000722b7ebb18b53330ffe4d6fa62009b99 evals/elsuite/identifying_variables/renderers/templates.py
18446f8d29875583f0bb720fb68f7baf0f10b624 evals/elsuite/identifying_variables/scripts
418ebe3fef26a67d53aa4c0b8470358d3e77a445 evals/elsuite/identifying_variables/scripts/data.sh
14c5f78e28a66fe862b14fc7f457c1bc2812feda evals/elsuite/identifying_variables/scripts/gen_data.py
f29f78149201fe8508da3ab0b37a06c30776147d evals/elsuite/identifying_variables/scripts/make_plots.py
1c80aab0422f8e2be3ed77e387afa0e52b61ae23 evals/elsuite/identifying_variables/scripts/plotting_utils.py
fae5ceb93be6000adb0859e3fc8ca5b3d9437dd5 evals/elsuite/identifying_variables/scripts/run_experiments.sh
3991cd469bc8d5b0d63eb90ffa0c86c1612181d1 evals/elsuite/identifying_variables/scripts/table_utils.py
c6010c74dae8da74d2be973abf811878782000aa evals/elsuite/identifying_variables/solvers.py
90b47b96b0714cc32d9de8ea078c71c5220f4b3f evals/elsuite/identifying_variables/structs.py
6918926bdf087080f4c1a34e1006df63a991bb44 evals/elsuite/identifying_variables/utils.py
7c72892ae9c33a5de31bc58065a5edb4f1f7f92f evals/elsuite/incontext_rl
69dbde303b67fa3f5de7babc94ebb01dcc9c7dd4 evals/elsuite/incontext_rl/README.md
40b3997e3ce30224c1bd7dc2176bf0cbf192d11e evals/elsuite/incontext_rl/anti-cot_solver.py
34f65d6cafb87d396683c780b392cb008e5373d3 evals/elsuite/incontext_rl/baselines.py
2f1cb06fc2773cc085c9706cea2758e47d83018b evals/elsuite/incontext_rl/defaults.py
31ffcba53490a8de47bdbb2ffc861dfd4dd584c2 evals/elsuite/incontext_rl/env_setup.py
a1fac2101ea3316c5181785a1c1e095e61e35c53 evals/elsuite/incontext_rl/eval.py
2712d1140bb44dbc91adbe66b0ff24b162633021 evals/elsuite/incontext_rl/requirements.txt
67052c5ce952f384697a5ce5c14eb1544c602874 evals/elsuite/incontext_rl/scripts
9e8e27f82b78884e2ccc88f5eb5694af920500ed evals/elsuite/incontext_rl/scripts/plot_experiments.py
bb2dc7ae449622d986f792acd23ef41e671c9c18 evals/elsuite/incontext_rl/scripts/qlearning_baseline.ipynb
9d8765dc22932cf8ae086bd3e6965034b40a3884 evals/elsuite/incontext_rl/scripts/run_experiments.sh
fb3bb356ef80bc0881e000f8a320634e4affb9d2 evals/elsuite/lambada.py
03f5884d136b5d1866017f4f714accf813af731a evals/elsuite/make_me_pay
9b2b8b127533b547992126180f13e5def0822b5e evals/elsuite/make_me_pay/eval.py
915e305e89500b425176900e0195542b995189fb evals/elsuite/make_me_pay/makemepay.py
be3076ab7b5228b8385b79625cdb4bf43e9ca37b evals/elsuite/make_me_pay/makemepay_test.py
46e2d34dff307ba202b9a108daa66567a2eab730 evals/elsuite/make_me_pay/readme.md
3c831ddf199f721653e67cd85feb4feeb889ad7a evals/elsuite/make_me_pay/scripts
fd7ebb048371b1d3a98f3f2615baabac7dbaa38b evals/elsuite/make_me_pay/scripts/make_plots.py
43765bc935d03b90a7990cfeab85bbf4b23da2b5 evals/elsuite/make_me_pay/scripts/run_experiments.sh
64a14cfcab2af8ca8bd725e7d1301aaedb50581c evals/elsuite/make_me_pay/scripts/run_experiments_longer.sh
1288d0e3c0c929da47664424bce3b20fff942cb5 evals/elsuite/make_me_pay/scripts/run_experiments_personality.sh
13575c00d8f36240c39a0abe183e6d9d49217bfc evals/elsuite/make_me_pay/solvers
0f3c6c42b81482f91e671a8ed92ebab08ee2f74c evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py
d8cc17dce282d7d3534fd33481f0e9e5b96c76d6 evals/elsuite/make_me_pay/solvers/prompts.py
62bf9cc21090bb9934ea9c4b0fcf9a2656edb964 evals/elsuite/make_me_pay/task_description.py
deb8f2c9ceaecbabd395ad41a1fcd922431f332d evals/elsuite/make_me_pay/utils.py
4f320b7feccf1345bbc74c4b33925be3ec5c2010 evals/elsuite/make_me_say
0a8fdcbe6fbd0b21cebb67d10e74e7ce536e1303 evals/elsuite/make_me_say/autoeval.py
e1b1d2181cfa5df7dafae58ef6036578ce47af6c evals/elsuite/make_me_say/core.py
9b9d67e141a8523554da22c1238ace4eabad2338 evals/elsuite/make_me_say/defaults.py
fcc41934048f2dc17385cf5f57bb2c3203550106 evals/elsuite/make_me_say/eval.py
93588df67c04aaadd6112396db2e92203944bc83 evals/elsuite/make_me_say/makemesay_test.py
f23c77ff105db11122d3ee937b9f476e29ef9710 evals/elsuite/make_me_say/readme.md
ba44b2950e765f625c17976bc3066919e40e03d6 evals/elsuite/make_me_say/utils.py
bcf1531ac74e50faab60ed50f0ce3f83c1ab70a3 evals/elsuite/mmmu
f338ba667e9e33e80c838a6866985fc250af0792 evals/elsuite/mmmu/eval.py
a9787d09c519796b371161da76977ad90f967bd4 evals/elsuite/modelgraded
6d5862804cf86c80b07bcff27e5fff12ab57eacd evals/elsuite/modelgraded/base.py
eb9661d7871ef15923a19f4179a44298cfb6924d evals/elsuite/modelgraded/classify.py
745ae5f0d2d83ed609623988f940050de385b44b evals/elsuite/modelgraded/classify_utils.py
302e8e5db38eb1c0cfe4f349d16e577b403d3ee2 evals/elsuite/multiple_choice.py
ce882621bfd859110504fcea7ae77498b7f86743 evals/elsuite/multistep_web_tasks
ef48d45fd4e1381c9466162bbc325f45dc427911 evals/elsuite/multistep_web_tasks/README.md
36aaa76641777e9e9156150527f1f17e93ef001c evals/elsuite/multistep_web_tasks/constants.py
213ae13ec2972bcfd51921a199445b3de3630ba6 evals/elsuite/multistep_web_tasks/docker
d5110631b2279e5a13f8e8ddfab48a4713dcd10e evals/elsuite/multistep_web_tasks/docker/dc-evals-bash
9d16c8789f0793eb63210429fabae3e00bcdbe03 evals/elsuite/multistep_web_tasks/docker/dc-evals-bash/Dockerfile
286a7255d37dbb52c7162122aaa2c60381ed4ac6 evals/elsuite/multistep_web_tasks/docker/flask-playwright
84ba0031b08c6bbfc28fb69e7bd363e5a43cdf5f evals/elsuite/multistep_web_tasks/docker/flask-playwright/Dockerfile
65386c6c9b113613f112e2ceefdd0a00c4362f91 evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py
85a06a4e4ace3e10e4a3f68ace09b65308c64c19 evals/elsuite/multistep_web_tasks/docker/gitlab
7302975e30cd5faad3da44ff0a31606d100e94e3 evals/elsuite/multistep_web_tasks/docker/gitlab/entrypoint.sh
aa0cb1abfbbded247705502c78061a4430d16fcc evals/elsuite/multistep_web_tasks/docker/homepage
c7bf95ef810c8c989e9d033695a00b74713f2a3c evals/elsuite/multistep_web_tasks/docker/homepage/Dockerfile
0b092c7ffa04900b92f70d3bf2816320e0f65b64 evals/elsuite/multistep_web_tasks/docker/homepage/app.py
af14c282fc8fd1ecfb4f1b2b0c721400135e8aa4 evals/elsuite/multistep_web_tasks/docker/homepage/docker-entrypoint.sh
8ab6294c6b5339b46e123e5ac62dd6651ae09c68 evals/elsuite/multistep_web_tasks/docker/homepage/requirements.txt
239df6f5261a4ff11752b542035ca2b7cf268ddd evals/elsuite/multistep_web_tasks/docker/homepage/static
a18754d25c28f490e64351e65927e654d9bae284 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures
53b7013861baf76f47886aa9934facf8742811b0 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/calculator.png
7ea5b53b351d9d97a003debff49f7ec9ffd6a10f evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/cms.png
a9c4af3b0ab08c85931b01cba7e7ebfcd26ed937 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/gitlab.png
0416212ff65c8d3967cd47c8f8291ed52d28489b evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/manual1.png
be6c7796b2740626e9a9b25e03e696dda35ec978 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/manual2.png
6718f5140c5935f188083949baff8b29a7775fd4 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/map.png
2669443a40a7b9d6309aad5ce6b1a3977572aa5c evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/onestopshop.png
891651313e38257f6714b1e08f03e3decbab65cc evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/password.png
796b00613f64cc577d784b97d6d054d09e9373e8 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/reddit.png
4afea7fdd2806c5299051809b6b382b6c7900c40 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/scratchpad.png
aa4695991434c3d7d468058da47f74571a17c8c4 evals/elsuite/multistep_web_tasks/docker/homepage/static/figures/wikipedia.png
56a007a61f96775bf8cf8a1a728fdec029d09c89 evals/elsuite/multistep_web_tasks/docker/homepage/templates
644529851b07da31ed9399155e9721b05f2c53a6 evals/elsuite/multistep_web_tasks/docker/homepage/templates/calculator.html
98693d159bc5ba27ce063a50061b2495bb8b4887 evals/elsuite/multistep_web_tasks/docker/homepage/templates/index.html
bd939d572eb81bba4cc6924a9c5f2867425190ab evals/elsuite/multistep_web_tasks/docker/homepage/templates/scratchpad.html
2cd7289e761b3d1b1ced2ce30649574de36ab931 evals/elsuite/multistep_web_tasks/eval.py
cd3e3f8084030cc43644a99813155868e96393fb evals/elsuite/multistep_web_tasks/reproducibility
f62417e030cd823fa157ece3850c453ec8a12e49 evals/elsuite/multistep_web_tasks/reproducibility/CLEANUP.sh
6db5233d8ee5c83da1a8673a239cd464d1950342 evals/elsuite/multistep_web_tasks/reproducibility/all_tasks.json
238eb5ffef33cbb9f8fcebfbb68d3c8af6522d19 evals/elsuite/multistep_web_tasks/reproducibility/make_plots.py
8074762afddebfda57c9e54854df8cff24543fce evals/elsuite/multistep_web_tasks/reproducibility/make_task_jsonl.py
8a10b200b890daadf40896f071eb04762583cc1a evals/elsuite/multistep_web_tasks/reproducibility/run_environments.py
baf288687d441004c2b6515c58ee022a1578b5e0 evals/elsuite/multistep_web_tasks/reproducibility/run_experiments.sh
cee007d44fa1f805ce217f9a3221d1766c3cde75 evals/elsuite/multistep_web_tasks/reproducibility/run_once.sh
44125b6d436e58def8c5dbda19df407949318357 evals/elsuite/multistep_web_tasks/session.py
616ecd73f5e94f0015d9fe6209649e34339796e3 evals/elsuite/multistep_web_tasks/solvers
12cc05f1f3c5966cfa126ff91a8954af184c7abd evals/elsuite/multistep_web_tasks/solvers/strong_solver
68d2d24346991bd3d418686355b108c8c0314c0f evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_prompts.py
692f855ee48752cecb6077fe66049b12c284e8f9 evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py
8eb40844b23a7284261073c64d51d957d3b3ca7e evals/elsuite/multistep_web_tasks/solvers/webarena_solvers
c85a5e9fde530d0b2e89cb0b91539ddcfd57155d evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_prompts.py
3a7d841d341c1aa144c8bac0888c8a9f274fb9e5 evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_solvers.py
9ce1d64201309bb794490dab66016cdfe8829ce0 evals/elsuite/multistep_web_tasks/utils.py
acdf1d10e27871d29556168507284100ec9ec5d0 evals/elsuite/multistep_web_tasks/webarena
872e502830457e04891d3d91c5c34cfe4e27fdc0 evals/elsuite/multistep_web_tasks/webarena/.auth
3604cbd31b67f18c15fc4207728788eb9e30e96f evals/elsuite/multistep_web_tasks/webarena/.auth/gitlab.reddit_state.json
805bc912f99fabc2568a3a8ca4087b9630b3e717 evals/elsuite/multistep_web_tasks/webarena/.auth/gitlab.shopping_admin_state.json
5aba9407d9184415b6c40b1fc220ef5d437c9122 evals/elsuite/multistep_web_tasks/webarena/.auth/gitlab.shopping_state.json
d5bb6371a6b22f9332618652dd8033dc693382ff evals/elsuite/multistep_web_tasks/webarena/.auth/gitlab_state.json
b60bd35962c39e38c8952b0bb0d2f6da04cc91c7 evals/elsuite/multistep_web_tasks/webarena/.auth/reddit_state.json
7cb0e08973970ac1acc9da874ca364f6d4596213 evals/elsuite/multistep_web_tasks/webarena/.auth/shopping.shopping_admin_state.json
42eca05c86bb8df7c0521a490a5cdff02939a0fe evals/elsuite/multistep_web_tasks/webarena/.auth/shopping_admin_state.json
534762bcf4781729b41cc7f902f692919e50b0dc evals/elsuite/multistep_web_tasks/webarena/.auth/shopping_state.json
261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 evals/elsuite/multistep_web_tasks/webarena/LICENSE
2d53af5db5fb84965d70c554b0a7a8795eec49c5 evals/elsuite/multistep_web_tasks/webarena/bash_browser_env
1b088ae6f16c1481656c79ed9b63140fc757958d evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py
c7a302565296826f2421597bb3b4431e9d137748 evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_utils.py
eb658c7b33d46a773034f3fc141c305c7d4273ea evals/elsuite/multistep_web_tasks/webarena/bash_env
af74ce3a51189e4f8aaca362f7ace65b4fa8a5e2 evals/elsuite/multistep_web_tasks/webarena/bash_env/actions.py
03a4a7bad00d15cb375bc8dcc7dfb5cd3d55d71d evals/elsuite/multistep_web_tasks/webarena/bash_env/bash_utils.py
f50b41db19ad9cb765d4dc69f93cb677e870becc evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py
d30b46399c6da3a3cc5c6da5d79ec08d93391857 evals/elsuite/multistep_web_tasks/webarena/browser_env
efa59f4b58641fc9951d51b0bde6ff6dfd06d63f evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py
868b478c6292ddab2d47962c8a7ac61ed4b4b54e evals/elsuite/multistep_web_tasks/webarena/browser_env/auto_login.py
4e64018484c2f11f1f644692feba6ad56d59d606 evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py
41575a98381cfc70fde779e8b5c195bb3eb6425a evals/elsuite/multistep_web_tasks/webarena/browser_env/browser_utils.py
4b8a4330acb1f55d9bae689ee9cf74c7a75c6fc8 evals/elsuite/multistep_web_tasks/webarena/browser_env/constants.py
0929d043e72200a197ea1c91cfc23a95f12837a4 evals/elsuite/multistep_web_tasks/webarena/browser_env/env_config.py
df70c59e780a15ca1c521a2174736a0c2fb620e6 evals/elsuite/multistep_web_tasks/webarena/browser_env/helper_functions.py
67fc57397f5c173d7150d294b24a633b35bfe290 evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py
95b497120f5a0c47c7cb1673da27ad3b4b3ae4e9 evals/elsuite/multistep_web_tasks/webarena/core
fd42cc2751c880e274a93d974dca8b569a2d26e4 evals/elsuite/multistep_web_tasks/webarena/core/env.py
651da13e6de921b653bc9f91f9099e7b02753a60 evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py
a637d2e6497b50e3acc6d0b874682275ca7f73bd evals/elsuite/multistep_web_tasks/webarena/core/utils.py
8825e955474acb2646a119d41a418a07377b1a8e evals/elsuite/multistep_web_tasks/webarena/eval_run.py
a3d4ecc75ae701846ac2b3eb2b0158606c6bd4b4 evals/elsuite/multistep_web_tasks/webarena/evaluation_harness
0d1e8dd15a4dc67ea6d7dff5bc06d40b78e687ff evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py
9822994a8d2ecee26e85cd15d8253ba46fd0832b evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/helper_functions.py
e272ddef51e7c834470fc0b2e7914d35bf5fd700 evals/elsuite/multistep_web_tasks/webarena/task_description.py
45ce23019739e39564ed1c7cf01a02ea679dc35a evals/elsuite/sandbagging
92ab8004b74f9d3f1919535a259ed4c811d81c29 evals/elsuite/sandbagging/README.md
5b474af0e419f81b822d47ca283a046bc5f65a18 evals/elsuite/sandbagging/defaults.py
ae421d8f625d887c4f22664483543859c3b81c15 evals/elsuite/sandbagging/mmlu_eval.py
675341a207fd6af2a71524edbbe1e9633f1bbec6 evals/elsuite/sandbagging/sandbagging_eval.py
85a1c036a546941bd92ec7b7daedc888a4730906 evals/elsuite/sandbagging/scripts
f129d1f0e6db6965224cb727230be9c18dfdb87a evals/elsuite/sandbagging/scripts/consistency.sh
18cf438fa42fe2811d516a2a89742042b5027516 evals/elsuite/sandbagging/scripts/consistency_plots.py
b10a5ac9135874e76bc6a36ce773fc9a85243288 evals/elsuite/sandbagging/scripts/sandbagging_all.sh
3300658c3c1e13c980ae04928b1a5134f315412e evals/elsuite/sandbagging/scripts/sandbagging_all_plots.py
debdd99a350516818eb8fb912eac96c4fd669e4a evals/elsuite/sandbagging/scripts/sandbagging_subset.sh
d4e01f92203fa6b80faf5737e2063cdc5de79251 evals/elsuite/sandbagging/scripts/sandbagging_subset_plots.py
3d862bd29a631f6ad8d5899e099dcea72972a7d7 evals/elsuite/sandbagging/scripts/utils.py
8ce27bc3f35a74bd6525ce6d405f2b646ce90c75 evals/elsuite/sandbagging/solvers.py
5bc205398e0fd16b563ebb7892164407a206b294 evals/elsuite/sandbagging/utils.py
59637edf26b82c2b8b16faabaa8ed74be93f856d evals/elsuite/schelling_point
86bafe1284a5a4fcbf1711048507d661b3c28036 evals/elsuite/schelling_point/README.md
46d5371af1e796149bf6e358b5e42754f89f3d3d evals/elsuite/schelling_point/eval.py
96079b89e9cd435e53a7c7d60378dd3aa2148ad1 evals/elsuite/schelling_point/prompts.py
92310ddd4f6834c8e1106bb7b91cad50e5b4ca4e evals/elsuite/schelling_point/utils.py
fb38b9c8ba71cc42062c2b3ed892e06c2a7356b5 evals/elsuite/self_prompting
7db858f5d4b939d3a49b9a5bac3436fa0b92a3f8 evals/elsuite/self_prompting/eval.py
bb5f2ee5cf6dfa93d44983e67363d9e23adba7c8 evals/elsuite/self_prompting/readme.md
5e554dd6ade079f5607b13fa01fc1ec39f1c5452 evals/elsuite/self_prompting/scripts
e594957f4b5c23e5f8527af8724ff26085cc7c1b evals/elsuite/self_prompting/scripts/dataset
6a5698c4e2dc5a5ad42d9644493cc2360e2d7a2a evals/elsuite/self_prompting/scripts/dataset/compile_data.py
782dcd4929b3a44fd84c54908aa1ed429465c02f evals/elsuite/self_prompting/scripts/dataset/eval_list.py
6d264e5e693d1486a115faace04c93a44af660d6 evals/elsuite/self_prompting/scripts/make_plots.py
6a00ada9ec4daf4cd85398a3b5b2291de1d5650b evals/elsuite/self_prompting/scripts/run_experiments.sh
9cf2518316359e23e194606beeaddb81be577be2 evals/elsuite/self_prompting/solvers
471008849482b818c934a3ee77a97e03685abf06 evals/elsuite/self_prompting/solvers/baselines.py
465f7f5e4d0048dd1e58d697a4e562b02c96f426 evals/elsuite/self_prompting/solvers/custom_cot_solver.py
e099e816d072d29f07f5b48bbdb9266dd126b21a evals/elsuite/self_prompting/task_description.py
2c988ce75c0e7018e2e0ab665f0c581a13a28830 evals/elsuite/skill_acquisition
52c770db7d89b5ef0327bbac51a0d62fc5c65088 evals/elsuite/skill_acquisition/eval.py
2d5a8fafcb14215e211e7b90d1bbdd1bff772464 evals/elsuite/skill_acquisition/readme.md
eb62e179fd4f7197d23d227e391b6d3ed65d11c3 evals/elsuite/skill_acquisition/scraping
c6d49a320cb972b27bfd20c0e867a0a1a898450c evals/elsuite/skill_acquisition/scraping/human_rights.html
93e248b10743ae2e249c1f8c778a5cfcff6d6e01 evals/elsuite/skill_acquisition/scraping/scrape_distractor_articles.py
697b5667cdd22ee61e65c5e3122bfedfc399d3df evals/elsuite/skill_acquisition/scraping/scrape_miskito.py
8cb84f6c708f52481eee9bd88b3ef07f325fba31 evals/elsuite/skill_acquisition/scripts
01eab83412634b842521a5638f778ebcd9ff7c48 evals/elsuite/skill_acquisition/scripts/make_plots.py
aaf81e0745a128bebca99a23ba174ae422842626 evals/elsuite/skill_acquisition/scripts/run_experiments.sh
16bd72f653d86f871c6c0c16cd679e6a48cf5989 evals/elsuite/skill_acquisition/solvers.py
5a356179d00f328b5347de82f48c598363c5e326 evals/elsuite/skill_acquisition/task_description.py
079fdcb4356424860157e992503a780ea2d05606 evals/elsuite/skill_acquisition/test_skill_acquisition.py
ef0ef09349af4bc052c08053b0ca625e577644de evals/elsuite/skill_acquisition/utils.py
8a13adf80bc3decbec314a53f48bba85af04eced evals/elsuite/solver_tools_convo.py
2a349bc35da89beff77cf30fc42de94eac98de02 evals/elsuite/steganography
e25e1bc55176c94017f57d1d48ad7ab27d7b9a48 evals/elsuite/steganography/eval.py
ba2b38756feb1991e8b4e62b057127238100a99b evals/elsuite/steganography/monitor.py
78d2fb1acebc5b785823f9376c7d9b120cc55dc4 evals/elsuite/steganography/prompts.py
91b5289b872340214d761ee2374b66508b6865ea evals/elsuite/steganography/readme.md
5dce8b39e8ee4b871cd01cccbaa1b01a573c9a59 evals/elsuite/steganography/reconstruction_metrics.py
cf829d3a1a73a1e30a5c738abe9583e6702310da evals/elsuite/steganography/scripts
e7977a55988b433e9900e9affca3be8969315fe9 evals/elsuite/steganography/scripts/dataset
26e8dad759b7193972e91e59dbe15c0304264960 evals/elsuite/steganography/scripts/dataset/README.md
50b94adc18530f60d7c4fee1bdabae02c8822088 evals/elsuite/steganography/scripts/dataset/complexity_metrics.py
a237aef23b21f6ce71f273a4850ce621a5c02e30 evals/elsuite/steganography/scripts/dataset/csv2jsonl.py
c259bb65ae571643d89f582d97216f63bc27d1b6 evals/elsuite/steganography/scripts/dataset/custom_datasets.py
c0cc4456cedc716cae1a093fcd4863bd4a7bac9a evals/elsuite/steganography/scripts/dataset/dataset.py
fb2eb4760f879a2f5d95dc0ba0caa4720e91eb62 evals/elsuite/steganography/scripts/dataset/requirements.txt
5069508fb8cc271bd7c5ad3dd3bc01dc0473b4a4 evals/elsuite/steganography/scripts/dataset/utils.py
0f0aee1a011928f2b94152b11037e95fb99b6151 evals/elsuite/steganography/scripts/make_plots.py
31dafb811d99e09621dbb02db6a6d7a8cc566847 evals/elsuite/steganography/scripts/run_experiments.sh
895375b1682c76bfb8234785253b02b31ae25ba4 evals/elsuite/steganography/steganography.py
01d350111e220b42fb47d45aa07044ffe987edd9 evals/elsuite/test
8885c0e81b2256ab99eddec92d65c26ba0263c2c evals/elsuite/test/match.py
f3d85c9a2b92017a9d0f918dbf38372d074db1c3 evals/elsuite/text_compression
0462a1d267077ba3a921b3274106a9df0878bc4f evals/elsuite/text_compression/compression.py
d2a620941bb28d968aa6aeb29c716ee97079311c evals/elsuite/text_compression/eval.py
bac7f819f9fa6e4bfd1489771adb570e9f6bc2ac evals/elsuite/text_compression/prompts.py
a4a5b706ead6ad1d3c1f8cb2c03ac76bd4f197da evals/elsuite/text_compression/readme.md
b4b9a7ce780f7072b80f9f0a790dd8526b85ad16 evals/elsuite/text_compression/scripts
f4f8815fb41bb95fde293545549be7dc38ebaa2e evals/elsuite/text_compression/scripts/dataset
8eba6d667e4c409c0cf286100e16b57c60ea2b02 evals/elsuite/text_compression/scripts/dataset/custom_datasets.py
2e20f3464c8e7aec03ef210d530d151992344d5d evals/elsuite/text_compression/scripts/dataset/dataset.py
479f03dfc5add6fa4cbcea2ddee7a7699936d338 evals/elsuite/text_compression/scripts/make_plots.py
96213a2e74da9398f09d74661b1282fcb639b223 evals/elsuite/text_compression/scripts/run_experiments.sh
42aa8e92e66228f2c277f6e5be903d4fa78733e5 evals/elsuite/theory_of_mind
1500cdca667de857f9ec33425579b9134151175d evals/elsuite/theory_of_mind/readme.md
b172163cd9bec80800f3df396e9b9d7970029728 evals/elsuite/theory_of_mind/scripts
0ac6075a6aabd29229b95254e05fcc4901117ad6 evals/elsuite/theory_of_mind/scripts/data_generation.py
ea6727560bf68bac2657ed1dbabb236bce57a5d5 evals/elsuite/theory_of_mind/scripts/make_plots.py
4087d816e5c7ac3ab29fda45c4b53de9c240ea44 evals/elsuite/theory_of_mind/scripts/run_experiments.sh
24d52249e89558d557fc465f68dc95cf01636769 evals/elsuite/track_the_stat
20c1580b2f6e707e3e7e64f0869a0c4d5673f8e3 evals/elsuite/track_the_stat/README.md
d1ca65d719c165a2b84ec12e94013824e97fa534 evals/elsuite/track_the_stat/eval.py
b2d394f22bbc6f9bad969a4b8c15a41a343133d3 evals/elsuite/track_the_stat/prompts
31d1b2de39cbca291de0fbc6c8615bec5f5ca31b evals/elsuite/track_the_stat/prompts/__init__.py
aae3c0ecc859ac62f6113e91c383fb56b1c906e4 evals/elsuite/track_the_stat/prompts/median.py
5756e7e55c54a43651e8bd8937dd1fbd753ea765 evals/elsuite/track_the_stat/prompts/mode.py
2e5116756f16da6fcb1b544d1e14167db3f5a732 evals/elsuite/track_the_stat/scripts
b40e4a3586122b22d41209103f60113f5daebc2d evals/elsuite/track_the_stat/scripts/make_plots.py
830786641838b70b85ab0337e4d5cb8aa32de5ad evals/elsuite/track_the_stat/scripts/run_experiments.sh
65721002cc980e31af8bf1001057ccaed487be87 evals/elsuite/track_the_stat/solvers.py
55467c5100a48cdf145c51a6beb137e97b7537ea evals/elsuite/track_the_stat/utils.py
b2892fc23d3e248e6bee02b91b38e96684d7923f evals/elsuite/translate.py
3a746cf9a790d3d73f99e4f433c3bea473fc238c evals/elsuite/twenty_questions
3cb0d5c857022995f9c2ee34c03bed9978c7def5 evals/elsuite/twenty_questions/eval.py
89f2e3ce6cd7d6e92b9babb15debf88d6114654c evals/elsuite/twenty_questions/readme.md
fae6f14fdd4ea061a54263de90e1e2b6fe38946c evals/elsuite/twenty_questions/scripts
f07b76da5a22a4324333bb4bbbee2897e105b758 evals/elsuite/twenty_questions/scripts/make_plots.py
4b8718d60782d28ab4f54463f642d286b91c3c05 evals/elsuite/twenty_questions/scripts/run_experiments.sh
069b86c8ad5e58065089854d0ec1669c7d5e5569 evals/elsuite/twenty_questions/test_utils.py
169dd0a97762e2763a47da721b6b1b42b212af80 evals/elsuite/twenty_questions/utils.py
2cc48941690ebeed9531cf4a9fe7e6535d24a3d8 evals/elsuite/utils.py
48fb8dc802a67b20a9ff29d5d50e48433bdb3edf evals/elsuite/utils.py
583608ec60c9d25c282c670df89f627b65dd6f7a evals/elsuite/utils.py
c35b27d58cf0b42e4d01df83db9f946fd7427004 evals/elsuite/utils_test.py
19ec758daf8e70bf3b634de0bd7c29ed8689df1b evals/eval.py
cce0c75c3fe7ad44a1d290c275d0c7a503d4d8c4 evals/eval.py
ec2a2843f388106a58fbbd51e4430ebb1113dec8 evals/formatting.py
bf68ed2fb6186c08b5bb7129ccef24e17bcbfa40 evals/metrics.py
e97c6270c47313689c2cfbdf360cd9011fdb8d81 evals/metrics.py
15f4f8cdbcde053cff99699e2050c22e45a98947 evals/prompt
1758d7c7cfabde8a970750de99954bfe815a8b46 evals/prompt/base.py
7bc6b98d15539f87d2b90359536802cd200739d4 evals/record.py
8e8ebe9ae62b4711817e08e668d4ac9a4b63cc23 evals/record.py
7807aeb792b419a882fffebe91d37d3014c3669f evals/record_test.py
1186cd1f19195de0a9264b9ae43d433da380a9f0 evals/registry
236e0d0e140a09f31feca9426c457765c4bcb72d evals/registry
36a8f6d34bc3e58be0e1d8d0e115e3f7eb093a9d evals/registry
4d29ae63ff0ea1d0dbebb7bd14139b660becf7a6 evals/registry
970fcea80242306e46d82dc49f35f5fb3b754dc0 evals/registry
991f8427525415bf08004b498066314935c5e7ce evals/registry
9e94623b08190db54e15ddc6bb82ada823d26a12 evals/registry
759b7c3997ade2a7939986df3afcbbfbeb792257 evals/registry.py
e999dd2df70596f8a1925909a5381d71a5bf95e2 evals/registry.py
30140f582b10e2cba869fe852d1e8fee927c73f6 evals/registry/completion_fns
328abcfcb25d4a165888647a6b70aac7bbcace5b evals/registry/completion_fns/cot.yaml
7a24650074d5f2d8d3b2d8d20705ef579036eb31 evals/registry/completion_fns/langchain_chains.yaml
d82e4e34a0be486ce33ca75e28831fe1e6937d1a evals/registry/completion_fns/langchain_llms.yaml
39c2b8f966f7b0a24543925d75496358303cf9dc evals/registry/data
4d549c0d5f0562b68db26aea97b5f5d705202746 evals/registry/data
54b171d65d839b03aaa4049834d7f00bf0f0f9b5 evals/registry/data
5bd79356be9c2446789520a4fafb676e7a969a10 evals/registry/data
5dd825a11bc260546d52079cefd2fb7e3b02dfc8 evals/registry/data
c5b41e241a86ba9fae48f993c7eff452f8f993a1 evals/registry/data
d3237c1bd488c13c0e58eebe173cbbcc36e044ba evals/registry/data
557320cbc6868f7ca3ae23f0b5caa6870f86305e evals/registry/data/2d_movement
56f76a1b419d426178d739171e7752e1c7307abb evals/registry/data/2d_movement/samples.jsonl
b21648f582376a9188951b6b835e2767ae4334af evals/registry/data/3d_globe_movement
abac30e955da8ade5dc637a1c42054d96df6dfb2 evals/registry/data/3d_globe_movement/samples.jsonl
87437555e62e6f4b8d0bf025a1f0d21dceef1eba evals/registry/data/3d_object_manipulation
75a58a62e841bff40286538273048ca985e6befe evals/registry/data/3d_object_manipulation/samples.jsonl
830c7c1b80bd60c62ea86ca666d7f4691dc24c05 evals/registry/data/Chinese_character_riddles
810f1cffcf0878ee863c29c72cb685718cc8b88a evals/registry/data/Chinese_character_riddles/samples.jsonl
59c36f3b49a3d08980b93d631095a41632c02d9a evals/registry/data/GOL
736f456b90edaf3e945d59f7b01b76ddfea896be evals/registry/data/GOL/samples.jsonl
9ae3d4f25cce072b8fc568f1bc6951330cb82c35 evals/registry/data/GPT-model-text-detection
8d8af69ab5c6259c03f6c755ded7207c4090a1cb evals/registry/data/GPT-model-text-detection/samples.jsonl
c572aba913a26416f64d859217d03dd422fddb00 evals/registry/data/GPT-model-text-detection/source.md
a639afc942be3dbc99befbbd07cacc00dfca4ff8 evals/registry/data/Japanese_onomatopoeia
ff64a3c20a79612e62554fcb53573431277b2a88 evals/registry/data/Japanese_onomatopoeia/samples.jsonl
d5186a3f3aa463596ca2d2fae57b6a84205b045d evals/registry/data/README.md
710fa9fae163784050f0437d3ea33b8095220136 evals/registry/data/Unfamiliar-Chinese-Character
ceb921ef6eee34af5311dfdd60e1e0dafc9b400a evals/registry/data/Unfamiliar-Chinese-Character/samples.jsonl
8968c4f73bb26ad25ce84b91538874706486a0ec evals/registry/data/ab
11c6e0e7d19a8c669b94288c3c4c1f2d4b79b21b evals/registry/data/ab/samples.jsonl
0d7d6329888e404b0e3b0a5b7d2bf0cae05bc21d evals/registry/data/aba_mrpc_true_false
838c8fb148f99f7384eaae850dfb752a1bf35587 evals/registry/data/aba_mrpc_true_false/samples.jsonl
684d61985abb41088ee80deec4574187e8d768a4 evals/registry/data/abstract-causal-reasoning
7ca4803daf0faad4e2741d34e1a431351dfd0b22 evals/registry/data/abstract-causal-reasoning/symbolic_samples.jsonl
93d0b37b67d4717f4df3ae21e5ec9531da9f433c evals/registry/data/abstract-causal-reasoning/text_samples.jsonl
ccfbab03c38724b3f393e719b456bfcc6fd105b5 evals/registry/data/abstract2title
1c22f2d7c6b9d03afbee584f39199093678536a0 evals/registry/data/abstract2title/samples.jsonl
827feab5c00b2a1c903546bf5f1dd96d1f47c736 evals/registry/data/accounting_audit
057af9142ecd1970e08dffc0d01d8efba89014cd evals/registry/data/accounting_audit/samples.jsonl
a5a4fdacc1edebad29469339d84b81df4c0bf26d evals/registry/data/actors-sequence
d579bf9044b33ad70c4e65052a948568a8d48521 evals/registry/data/actors-sequence/samples.jsonl
e064d88d6993de980b14ed9565c3a1b4ec12c805 evals/registry/data/adultery-state-laws
1a8a96e00fc46663ab0401008868f4c42c0a16d2 evals/registry/data/adultery-state-laws/samples.jsonl
c1375c4d023ae443013a66eae2b8f1136051e117 evals/registry/data/afrikaans-lexicon
cfe0bc05763704fabefa39d18c711b14394c74e1 evals/registry/data/afrikaans-lexicon/samples.jsonl
0348ea53b33dd786ea7037f75283a32358b49337 evals/registry/data/agentclinic
e5b0ae07103faf1533c6c423323be19ad4ead79e evals/registry/data/agentclinic/agentclinic_medqa_extended.jsonl
69a11fe26c57874c542b2244fa30f08323819262 evals/registry/data/agentclinic/agentclinic_nejm_extended.jsonl
0fb0841acae96ced19370a8996381216f0fbdc4b evals/registry/data/agentclinic/convert.py
de16873958a53a5d49cbb472cc4633777a1aa535 evals/registry/data/ai_hospital
0b1e50dc68fadfe9d728d8c0e861603eb666d9b8 evals/registry/data/ai_hospital/patients.json
7af695ecac341e72b8ec9562ade66dcbd2eeaf54 evals/registry/data/aime_evaluation
5a1658bea6f980f00a6479b494da6cbcec3eafb1 evals/registry/data/aime_evaluation/samples.jsonl
a1f8a7e3deaa2624e1798e039890f6a347c1a751 evals/registry/data/albanian-exams-qa
e2e6f71533e969602fb9860dc26a7d3fe9acfa9d evals/registry/data/albanian-exams-qa/samples.jsonl
cc303be54e2371c005b130ffea2a845b38526ce9 evals/registry/data/algebra_word_problems
3000ff1ea4f07a9ad6f7d79d7d01b198dc6d7ea1 evals/registry/data/algebra_word_problems/samples.jsonl
2e04e9b7e28bd18964f83769ebe148b2a88834c5 evals/registry/data/allergen-information
6cf1490f34ed842df8e7c0acb0e3a90cefe6d5e9 evals/registry/data/allergen-information/samples.jsonl
a9120bebf90b3fa5d35a834a52cfe226194fa3e6 evals/registry/data/already_said_that
1804855f51113217289dab04ebf6313a90db82cf evals/registry/data/already_said_that/500_100.jsonl
5a6c138028f71330c9f0257a734fe110b1f4661e evals/registry/data/already_said_that/LICENSE
f9c7568f7301400b2ba931c24e1c7be5c480a6be evals/registry/data/alternate_numeral_systems
8ebaf066d5ddf702b4e07a1310041a91a1a27c94 evals/registry/data/alternate_numeral_systems/samples.jsonl