| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9640831758034026, | |
| "eval_steps": 500, | |
| "global_step": 510, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 95.96875, | |
| "epoch": 0.001890359168241966, | |
| "grad_norm": 2.3602562531627758, | |
| "kl": 0.0, | |
| "learning_rate": 1.999982365744487e-06, | |
| "loss": 0.0, | |
| "reward": 1.514066219329834, | |
| "reward_std": 0.375105082988739, | |
| "rewards/accuracy_reward": 0.5140663385391235, | |
| "rewards/format_reward": 1.0, | |
| "step": 1 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.3515625, | |
| "epoch": 0.003780718336483932, | |
| "grad_norm": 6.640343536616842, | |
| "kl": 0.0013275146484375, | |
| "learning_rate": 1.999929463599883e-06, | |
| "loss": 0.0001, | |
| "reward": 1.417905330657959, | |
| "reward_std": 0.37063267827033997, | |
| "rewards/accuracy_reward": 0.42181164026260376, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 2 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.35546875, | |
| "epoch": 0.005671077504725898, | |
| "grad_norm": 2.414094297005193, | |
| "kl": 0.00127410888671875, | |
| "learning_rate": 1.9998412954319676e-06, | |
| "loss": 0.0001, | |
| "reward": 1.4140625, | |
| "reward_std": 0.3621976673603058, | |
| "rewards/accuracy_reward": 0.4140625, | |
| "rewards/format_reward": 1.0, | |
| "step": 3 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.7734375, | |
| "epoch": 0.007561436672967864, | |
| "grad_norm": 2.2244626207569187, | |
| "kl": 0.00150299072265625, | |
| "learning_rate": 1.9997178643503e-06, | |
| "loss": 0.0001, | |
| "reward": 1.408446192741394, | |
| "reward_std": 0.3336002826690674, | |
| "rewards/accuracy_reward": 0.40844619274139404, | |
| "rewards/format_reward": 1.0, | |
| "step": 4 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.0625, | |
| "completion_length": 89.6640625, | |
| "epoch": 0.00945179584120983, | |
| "grad_norm": 2.3915039557721607, | |
| "kl": 0.0021209716796875, | |
| "learning_rate": 1.999559174708112e-06, | |
| "loss": 0.0001, | |
| "reward": 1.5034363269805908, | |
| "reward_std": 0.37499552965164185, | |
| "rewards/accuracy_reward": 0.5073425769805908, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 5 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 88.1875, | |
| "epoch": 0.011342155009451797, | |
| "grad_norm": 4.082221628547588, | |
| "kl": 0.00335693359375, | |
| "learning_rate": 1.99936523210215e-06, | |
| "loss": 0.0001, | |
| "reward": 1.4816137552261353, | |
| "reward_std": 0.3769935369491577, | |
| "rewards/accuracy_reward": 0.48161375522613525, | |
| "rewards/format_reward": 1.0, | |
| "step": 6 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.03125, | |
| "completion_length": 84.96875, | |
| "epoch": 0.013232514177693762, | |
| "grad_norm": 2.411737580254065, | |
| "kl": 0.00433349609375, | |
| "learning_rate": 1.999136043372481e-06, | |
| "loss": 0.0002, | |
| "reward": 1.6246747970581055, | |
| "reward_std": 0.3329503536224365, | |
| "rewards/accuracy_reward": 0.6246747970581055, | |
| "rewards/format_reward": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.140625, | |
| "epoch": 0.015122873345935728, | |
| "grad_norm": 2.5943017844391107, | |
| "kl": 0.00537109375, | |
| "learning_rate": 1.9988716166022506e-06, | |
| "loss": 0.0002, | |
| "reward": 1.4784858226776123, | |
| "reward_std": 0.3560720384120941, | |
| "rewards/accuracy_reward": 0.4784858524799347, | |
| "rewards/format_reward": 1.0, | |
| "step": 8 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 82.1328125, | |
| "epoch": 0.017013232514177693, | |
| "grad_norm": 2.1704796065424508, | |
| "kl": 0.00628662109375, | |
| "learning_rate": 1.998571961117397e-06, | |
| "loss": 0.0003, | |
| "reward": 1.4728260040283203, | |
| "reward_std": 0.3873444199562073, | |
| "rewards/accuracy_reward": 0.47282594442367554, | |
| "rewards/format_reward": 1.0, | |
| "step": 9 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 90.22265625, | |
| "epoch": 0.01890359168241966, | |
| "grad_norm": 1.9312909564522398, | |
| "kl": 0.0068359375, | |
| "learning_rate": 1.9982370874863233e-06, | |
| "loss": 0.0003, | |
| "reward": 1.5304478406906128, | |
| "reward_std": 0.3415384888648987, | |
| "rewards/accuracy_reward": 0.5343540906906128, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 10 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 99.89453125, | |
| "epoch": 0.020793950850661626, | |
| "grad_norm": 2.183103416981468, | |
| "kl": 0.0074462890625, | |
| "learning_rate": 1.9978670075195237e-06, | |
| "loss": 0.0003, | |
| "reward": 1.5692423582077026, | |
| "reward_std": 0.39161738753318787, | |
| "rewards/accuracy_reward": 0.5848673582077026, | |
| "rewards/format_reward": 0.984375, | |
| "step": 11 | |
| }, | |
| { | |
| "all_correct": 0.0, | |
| "all_wrong": 0.0, | |
| "completion_length": 93.22265625, | |
| "epoch": 0.022684310018903593, | |
| "grad_norm": 8.227068776940921, | |
| "kl": 0.00958251953125, | |
| "learning_rate": 1.9974617342691674e-06, | |
| "loss": 0.0004, | |
| "reward": 1.468308687210083, | |
| "reward_std": 0.46158915758132935, | |
| "rewards/accuracy_reward": 0.47612112760543823, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 12 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 87.81640625, | |
| "epoch": 0.024574669187145556, | |
| "grad_norm": 3.223495772387027, | |
| "kl": 0.01129150390625, | |
| "learning_rate": 1.997021282028639e-06, | |
| "loss": 0.0005, | |
| "reward": 1.5035626888275146, | |
| "reward_std": 0.3581737279891968, | |
| "rewards/accuracy_reward": 0.5035626888275146, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.0625, | |
| "completion_length": 88.83984375, | |
| "epoch": 0.026465028355387523, | |
| "grad_norm": 3.1552889851956594, | |
| "kl": 0.0126953125, | |
| "learning_rate": 1.9965456663320324e-06, | |
| "loss": 0.0005, | |
| "reward": 1.507861852645874, | |
| "reward_std": 0.294893354177475, | |
| "rewards/accuracy_reward": 0.507861852645874, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.125, | |
| "completion_length": 92.70703125, | |
| "epoch": 0.02835538752362949, | |
| "grad_norm": 2.6633455753050947, | |
| "kl": 0.011962890625, | |
| "learning_rate": 1.996034903953606e-06, | |
| "loss": 0.0005, | |
| "reward": 1.403101921081543, | |
| "reward_std": 0.35901227593421936, | |
| "rewards/accuracy_reward": 0.4031018912792206, | |
| "rewards/format_reward": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.0, | |
| "completion_length": 79.59765625, | |
| "epoch": 0.030245746691871456, | |
| "grad_norm": 4.571581966102541, | |
| "kl": 0.0162353515625, | |
| "learning_rate": 1.9954890129071873e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5757702589035034, | |
| "reward_std": 0.3923387825489044, | |
| "rewards/accuracy_reward": 0.5757702589035034, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.03125, | |
| "completion_length": 82.7890625, | |
| "epoch": 0.03213610586011342, | |
| "grad_norm": 2.393421904457873, | |
| "kl": 0.01361083984375, | |
| "learning_rate": 1.9949080124455415e-06, | |
| "loss": 0.0005, | |
| "reward": 1.5704209804534912, | |
| "reward_std": 0.3697139024734497, | |
| "rewards/accuracy_reward": 0.5704209804534912, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.125, | |
| "completion_length": 88.5234375, | |
| "epoch": 0.034026465028355386, | |
| "grad_norm": 2.8917838645260643, | |
| "kl": 0.0155029296875, | |
| "learning_rate": 1.9942919230596897e-06, | |
| "loss": 0.0006, | |
| "reward": 1.5118800401687622, | |
| "reward_std": 0.3223443031311035, | |
| "rewards/accuracy_reward": 0.5196925401687622, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 18 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 84.2421875, | |
| "epoch": 0.035916824196597356, | |
| "grad_norm": 5.108847500395768, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 1.9936407664781867e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5046335458755493, | |
| "reward_std": 0.3630630373954773, | |
| "rewards/accuracy_reward": 0.5046335458755493, | |
| "rewards/format_reward": 1.0, | |
| "step": 19 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 84.87890625, | |
| "epoch": 0.03780718336483932, | |
| "grad_norm": 2.1872918469896065, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 1.992954565666356e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5272603034973145, | |
| "reward_std": 0.32879531383514404, | |
| "rewards/accuracy_reward": 0.5350728034973145, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 20 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 79.5859375, | |
| "epoch": 0.03969754253308128, | |
| "grad_norm": 3.3985079753937026, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 1.9922333448254785e-06, | |
| "loss": 0.0007, | |
| "reward": 1.4413049221038818, | |
| "reward_std": 0.3449176847934723, | |
| "rewards/accuracy_reward": 0.44130486249923706, | |
| "rewards/format_reward": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 87.44921875, | |
| "epoch": 0.04158790170132325, | |
| "grad_norm": 2.1124500662245302, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 1.9914771293919394e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5039958953857422, | |
| "reward_std": 0.36595243215560913, | |
| "rewards/accuracy_reward": 0.5039960145950317, | |
| "rewards/format_reward": 1.0, | |
| "step": 22 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.96484375, | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 2.57988777327046, | |
| "kl": 0.020263671875, | |
| "learning_rate": 1.9906859460363304e-06, | |
| "loss": 0.0008, | |
| "reward": 1.406567931175232, | |
| "reward_std": 0.32273876667022705, | |
| "rewards/accuracy_reward": 0.41828668117523193, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 23 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.0625, | |
| "completion_length": 81.42578125, | |
| "epoch": 0.045368620037807186, | |
| "grad_norm": 2.877431959109021, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 1.9898598226625114e-06, | |
| "loss": 0.0008, | |
| "reward": 1.5652744770050049, | |
| "reward_std": 0.33840304613113403, | |
| "rewards/accuracy_reward": 0.5652744770050049, | |
| "rewards/format_reward": 1.0, | |
| "step": 24 | |
| }, | |
| { | |
| "all_correct": 0.0, | |
| "all_wrong": 0.0625, | |
| "completion_length": 82.98046875, | |
| "epoch": 0.04725897920604915, | |
| "grad_norm": 2.669500830918649, | |
| "kl": 0.01953125, | |
| "learning_rate": 1.9889987884066234e-06, | |
| "loss": 0.0008, | |
| "reward": 1.457775354385376, | |
| "reward_std": 0.3964886963367462, | |
| "rewards/accuracy_reward": 0.4577752947807312, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.69921875, | |
| "epoch": 0.04914933837429111, | |
| "grad_norm": 2.1403683676993577, | |
| "kl": 0.01806640625, | |
| "learning_rate": 1.9881028736360623e-06, | |
| "loss": 0.0007, | |
| "reward": 1.3968093395233154, | |
| "reward_std": 0.3421282172203064, | |
| "rewards/accuracy_reward": 0.3968093991279602, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 86.796875, | |
| "epoch": 0.05103969754253308, | |
| "grad_norm": 2.435390719204286, | |
| "kl": 0.0238037109375, | |
| "learning_rate": 1.9871721099484077e-06, | |
| "loss": 0.001, | |
| "reward": 1.4553546905517578, | |
| "reward_std": 0.3946911692619324, | |
| "rewards/accuracy_reward": 0.4553546607494354, | |
| "rewards/format_reward": 1.0, | |
| "step": 27 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 89.609375, | |
| "epoch": 0.052930056710775046, | |
| "grad_norm": 2.8058570294434158, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 1.98620653017031e-06, | |
| "loss": 0.0009, | |
| "reward": 1.4355332851409912, | |
| "reward_std": 0.3729744553565979, | |
| "rewards/accuracy_reward": 0.4394395351409912, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 28 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 89.26953125, | |
| "epoch": 0.054820415879017016, | |
| "grad_norm": 2.5095592118103562, | |
| "kl": 0.0234375, | |
| "learning_rate": 1.9852061683563294e-06, | |
| "loss": 0.0009, | |
| "reward": 1.4201900959014893, | |
| "reward_std": 0.3762331008911133, | |
| "rewards/accuracy_reward": 0.4280025064945221, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 29 | |
| }, | |
| { | |
| "all_correct": 0.0, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.44921875, | |
| "epoch": 0.05671077504725898, | |
| "grad_norm": 2.251874852478182, | |
| "kl": 0.02587890625, | |
| "learning_rate": 1.9841710597877382e-06, | |
| "loss": 0.001, | |
| "reward": 1.482748031616211, | |
| "reward_std": 0.37411996722221375, | |
| "rewards/accuracy_reward": 0.4827480912208557, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.125, | |
| "completion_length": 93.671875, | |
| "epoch": 0.05860113421550094, | |
| "grad_norm": 3.0094971920246465, | |
| "kl": 0.0240478515625, | |
| "learning_rate": 1.9831012409712736e-06, | |
| "loss": 0.001, | |
| "reward": 1.5294721126556396, | |
| "reward_std": 0.28737539052963257, | |
| "rewards/accuracy_reward": 0.5294721126556396, | |
| "rewards/format_reward": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.0625, | |
| "completion_length": 89.1875, | |
| "epoch": 0.06049149338374291, | |
| "grad_norm": 2.085921014940215, | |
| "kl": 0.02294921875, | |
| "learning_rate": 1.981996749637853e-06, | |
| "loss": 0.0009, | |
| "reward": 1.4983799457550049, | |
| "reward_std": 0.3365086317062378, | |
| "rewards/accuracy_reward": 0.5100986957550049, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 32 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 90.39453125, | |
| "epoch": 0.062381852551984876, | |
| "grad_norm": 2.622758297904014, | |
| "kl": 0.0267333984375, | |
| "learning_rate": 1.9808576247412406e-06, | |
| "loss": 0.0011, | |
| "reward": 1.457702875137329, | |
| "reward_std": 0.397009938955307, | |
| "rewards/accuracy_reward": 0.4577029049396515, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 89.71875, | |
| "epoch": 0.06427221172022685, | |
| "grad_norm": 2.171724077831634, | |
| "kl": 0.0255126953125, | |
| "learning_rate": 1.979683906456676e-06, | |
| "loss": 0.001, | |
| "reward": 1.5610275268554688, | |
| "reward_std": 0.3770396411418915, | |
| "rewards/accuracy_reward": 0.5688400268554688, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 34 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.125, | |
| "completion_length": 102.4375, | |
| "epoch": 0.0661625708884688, | |
| "grad_norm": 2.4062787127917553, | |
| "kl": 0.0238037109375, | |
| "learning_rate": 1.9784756361794553e-06, | |
| "loss": 0.001, | |
| "reward": 1.4750714302062988, | |
| "reward_std": 0.39490264654159546, | |
| "rewards/accuracy_reward": 0.49069640040397644, | |
| "rewards/format_reward": 0.984375, | |
| "step": 35 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 100.83203125, | |
| "epoch": 0.06805293005671077, | |
| "grad_norm": 1.923377019543097, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 1.9772328565234715e-06, | |
| "loss": 0.0011, | |
| "reward": 1.453148603439331, | |
| "reward_std": 0.3995356857776642, | |
| "rewards/accuracy_reward": 0.46877366304397583, | |
| "rewards/format_reward": 0.984375, | |
| "step": 36 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.0625, | |
| "completion_length": 93.76953125, | |
| "epoch": 0.06994328922495274, | |
| "grad_norm": 2.5852022147016065, | |
| "kl": 0.033935546875, | |
| "learning_rate": 1.9759556113197133e-06, | |
| "loss": 0.0014, | |
| "reward": 1.5378497838974, | |
| "reward_std": 0.3647596836090088, | |
| "rewards/accuracy_reward": 0.5378497838973999, | |
| "rewards/format_reward": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.03125, | |
| "completion_length": 93.65625, | |
| "epoch": 0.07183364839319471, | |
| "grad_norm": 3.0899596065894617, | |
| "kl": 0.0311279296875, | |
| "learning_rate": 1.974643945614717e-06, | |
| "loss": 0.0012, | |
| "reward": 1.5544549226760864, | |
| "reward_std": 0.38801953196525574, | |
| "rewards/accuracy_reward": 0.5661737322807312, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 38 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 87.58203125, | |
| "epoch": 0.07372400756143667, | |
| "grad_norm": 2.210050723375974, | |
| "kl": 0.031982421875, | |
| "learning_rate": 1.973297905668979e-06, | |
| "loss": 0.0013, | |
| "reward": 1.4015624523162842, | |
| "reward_std": 0.32554763555526733, | |
| "rewards/accuracy_reward": 0.40546876192092896, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 39 | |
| }, | |
| { | |
| "all_correct": 0.0, | |
| "all_wrong": 0.25, | |
| "completion_length": 96.12109375, | |
| "epoch": 0.07561436672967864, | |
| "grad_norm": 3.5246311894519393, | |
| "kl": 0.029052734375, | |
| "learning_rate": 1.971917538955324e-06, | |
| "loss": 0.0012, | |
| "reward": 1.3508296012878418, | |
| "reward_std": 0.28737950325012207, | |
| "rewards/accuracy_reward": 0.350829541683197, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.0625, | |
| "completion_length": 89.08984375, | |
| "epoch": 0.07750472589792061, | |
| "grad_norm": 2.4498338590518633, | |
| "kl": 0.03466796875, | |
| "learning_rate": 1.9705028941572306e-06, | |
| "loss": 0.0014, | |
| "reward": 1.5412009954452515, | |
| "reward_std": 0.28274005651474, | |
| "rewards/accuracy_reward": 0.5412009358406067, | |
| "rewards/format_reward": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.94921875, | |
| "epoch": 0.07939508506616257, | |
| "grad_norm": 2.3904089625119402, | |
| "kl": 0.03369140625, | |
| "learning_rate": 1.9690540211671144e-06, | |
| "loss": 0.0013, | |
| "reward": 1.5241187810897827, | |
| "reward_std": 0.3563092350959778, | |
| "rewards/accuracy_reward": 0.5241187810897827, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.828125, | |
| "epoch": 0.08128544423440454, | |
| "grad_norm": 1.9707044514802314, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1.9675709710845685e-06, | |
| "loss": 0.0015, | |
| "reward": 1.506849765777588, | |
| "reward_std": 0.37673529982566833, | |
| "rewards/accuracy_reward": 0.5263809561729431, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 43 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.125, | |
| "completion_length": 95.9140625, | |
| "epoch": 0.0831758034026465, | |
| "grad_norm": 2.2780949740065326, | |
| "kl": 0.037109375, | |
| "learning_rate": 1.966053796214561e-06, | |
| "loss": 0.0015, | |
| "reward": 1.53428316116333, | |
| "reward_std": 0.30121245980262756, | |
| "rewards/accuracy_reward": 0.5342831015586853, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0, | |
| "completion_length": 93.00390625, | |
| "epoch": 0.08506616257088846, | |
| "grad_norm": 2.5387336771715483, | |
| "kl": 0.033935546875, | |
| "learning_rate": 1.9645025500655906e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6001973152160645, | |
| "reward_std": 0.3812434673309326, | |
| "rewards/accuracy_reward": 0.6001973152160645, | |
| "rewards/format_reward": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.125, | |
| "completion_length": 95.98046875, | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 1.9377592683430596, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1.9629172873477994e-06, | |
| "loss": 0.0014, | |
| "reward": 1.5135773420333862, | |
| "reward_std": 0.31737208366394043, | |
| "rewards/accuracy_reward": 0.5174835920333862, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 46 | |
| }, | |
| { | |
| "all_correct": 0.03125, | |
| "all_wrong": 0.125, | |
| "completion_length": 100.23828125, | |
| "epoch": 0.0888468809073724, | |
| "grad_norm": 2.869116193500619, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1.9612980639710424e-06, | |
| "loss": 0.0015, | |
| "reward": 1.4429457187652588, | |
| "reward_std": 0.3532693684101105, | |
| "rewards/accuracy_reward": 0.4468519687652588, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 47 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.6640625, | |
| "epoch": 0.09073724007561437, | |
| "grad_norm": 2.613636479936795, | |
| "kl": 0.0361328125, | |
| "learning_rate": 1.959644937042918e-06, | |
| "loss": 0.0014, | |
| "reward": 1.4310598373413086, | |
| "reward_std": 0.3048115074634552, | |
| "rewards/accuracy_reward": 0.43887221813201904, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 48 | |
| }, | |
| { | |
| "all_correct": 0.0625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 94.671875, | |
| "epoch": 0.09262759924385633, | |
| "grad_norm": 2.3402731827939296, | |
| "kl": 0.0361328125, | |
| "learning_rate": 1.957957964866751e-06, | |
| "loss": 0.0014, | |
| "reward": 1.512986660003662, | |
| "reward_std": 0.3828140199184418, | |
| "rewards/accuracy_reward": 0.5168927907943726, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 49 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 98.546875, | |
| "epoch": 0.0945179584120983, | |
| "grad_norm": 2.244807148397354, | |
| "kl": 0.03857421875, | |
| "learning_rate": 1.956237206939538e-06, | |
| "loss": 0.0015, | |
| "reward": 1.47536301612854, | |
| "reward_std": 0.400837242603302, | |
| "rewards/accuracy_reward": 0.47926920652389526, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 50 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 90.328125, | |
| "epoch": 0.09640831758034027, | |
| "grad_norm": 2.3971385643088348, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1.9544827239498494e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5585781335830688, | |
| "reward_std": 0.37756532430648804, | |
| "rewards/accuracy_reward": 0.5624843835830688, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 51 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 84.60546875, | |
| "epoch": 0.09829867674858223, | |
| "grad_norm": 2.2709012051170987, | |
| "kl": 0.038330078125, | |
| "learning_rate": 1.952694577775688e-06, | |
| "loss": 0.0015, | |
| "reward": 1.4947199821472168, | |
| "reward_std": 0.28436505794525146, | |
| "rewards/accuracy_reward": 0.4947200417518616, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.6953125, | |
| "epoch": 0.1001890359168242, | |
| "grad_norm": 2.3353621567696172, | |
| "kl": 0.040283203125, | |
| "learning_rate": 1.950872831482306e-06, | |
| "loss": 0.0016, | |
| "reward": 1.5374271869659424, | |
| "reward_std": 0.2822425663471222, | |
| "rewards/accuracy_reward": 0.5374271869659424, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 85.8828125, | |
| "epoch": 0.10207939508506617, | |
| "grad_norm": 2.686467954946551, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.949017549319983e-06, | |
| "loss": 0.0018, | |
| "reward": 1.546825647354126, | |
| "reward_std": 0.2987156808376312, | |
| "rewards/accuracy_reward": 0.546825647354126, | |
| "rewards/format_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "all_correct": 0.09375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 98.734375, | |
| "epoch": 0.10396975425330812, | |
| "grad_norm": 1.9504297926987135, | |
| "kl": 0.0390625, | |
| "learning_rate": 1.947128796721759e-06, | |
| "loss": 0.0016, | |
| "reward": 1.6038429737091064, | |
| "reward_std": 0.3754524886608124, | |
| "rewards/accuracy_reward": 0.6077491641044617, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 55 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 93.5859375, | |
| "epoch": 0.10586011342155009, | |
| "grad_norm": 3.287003319255905, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1.9452066403011253e-06, | |
| "loss": 0.0017, | |
| "reward": 1.4696145057678223, | |
| "reward_std": 0.34197893738746643, | |
| "rewards/accuracy_reward": 0.47352084517478943, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 56 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.36328125, | |
| "epoch": 0.10775047258979206, | |
| "grad_norm": 3.2939637852247685, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.9432511478496766e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5146770477294922, | |
| "reward_std": 0.26444536447525024, | |
| "rewards/accuracy_reward": 0.514677107334137, | |
| "rewards/format_reward": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.765625, | |
| "epoch": 0.10964083175803403, | |
| "grad_norm": 2.1754630667582413, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.9412623883347206e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5093607902526855, | |
| "reward_std": 0.26719433069229126, | |
| "rewards/accuracy_reward": 0.5171732902526855, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 58 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 89.7890625, | |
| "epoch": 0.11153119092627599, | |
| "grad_norm": 3.158327945972579, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1.939240431896844e-06, | |
| "loss": 0.0017, | |
| "reward": 1.4645804166793823, | |
| "reward_std": 0.333609402179718, | |
| "rewards/accuracy_reward": 0.46848660707473755, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 59 | |
| }, | |
| { | |
| "all_correct": 0.125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.24609375, | |
| "epoch": 0.11342155009451796, | |
| "grad_norm": 4.067057374654062, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.937185349847439e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4154318571090698, | |
| "reward_std": 0.28358522057533264, | |
| "rewards/accuracy_reward": 0.41543182730674744, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 85.125, | |
| "epoch": 0.11531190926275993, | |
| "grad_norm": 10.445979366082359, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.9350972146661903e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5046515464782715, | |
| "reward_std": 0.24698607623577118, | |
| "rewards/accuracy_reward": 0.5085577964782715, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 61 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.96484375, | |
| "epoch": 0.11720226843100189, | |
| "grad_norm": 4.779328158469133, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.9329760999985165e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4373040199279785, | |
| "reward_std": 0.29254239797592163, | |
| "rewards/accuracy_reward": 0.4373040795326233, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.125, | |
| "completion_length": 85.94140625, | |
| "epoch": 0.11909262759924386, | |
| "grad_norm": 2.355263239902568, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.9308220806529737e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4870660305023193, | |
| "reward_std": 0.3119330108165741, | |
| "rewards/accuracy_reward": 0.49097222089767456, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 63 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 82.74609375, | |
| "epoch": 0.12098298676748583, | |
| "grad_norm": 2.4735239418793604, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.9286352325986163e-06, | |
| "loss": 0.002, | |
| "reward": 1.5674299001693726, | |
| "reward_std": 0.2720338702201843, | |
| "rewards/accuracy_reward": 0.5791486501693726, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 64 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 84.96875, | |
| "epoch": 0.12287334593572778, | |
| "grad_norm": 3.2309139052967457, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1.9264156329623195e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5520949363708496, | |
| "reward_std": 0.2912652790546417, | |
| "rewards/accuracy_reward": 0.5520949363708496, | |
| "rewards/format_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.56640625, | |
| "epoch": 0.12476370510396975, | |
| "grad_norm": 3.426837800551729, | |
| "kl": 0.05126953125, | |
| "learning_rate": 1.9241633600260575e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5658715963363647, | |
| "reward_std": 0.2718903720378876, | |
| "rewards/accuracy_reward": 0.5658715963363647, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.125, | |
| "completion_length": 85.89453125, | |
| "epoch": 0.1266540642722117, | |
| "grad_norm": 5.040208361814542, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.921878493224143e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4924273490905762, | |
| "reward_std": 0.2940727174282074, | |
| "rewards/accuracy_reward": 0.49633359909057617, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 67 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.125, | |
| "completion_length": 78.671875, | |
| "epoch": 0.1285444234404537, | |
| "grad_norm": 2.2476945314714643, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.9195611131404267e-06, | |
| "loss": 0.002, | |
| "reward": 1.5983318090438843, | |
| "reward_std": 0.21439965069293976, | |
| "rewards/accuracy_reward": 0.5983318090438843, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.25, | |
| "completion_length": 87.28515625, | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 1.8603814885894778, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1.9172113015054528e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3991045951843262, | |
| "reward_std": 0.22454139590263367, | |
| "rewards/accuracy_reward": 0.4030107259750366, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 69 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 85.96484375, | |
| "epoch": 0.1323251417769376, | |
| "grad_norm": 3.4308816795426536, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1.9148291411935796e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5655429363250732, | |
| "reward_std": 0.3076818287372589, | |
| "rewards/accuracy_reward": 0.5655430555343628, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 91.11328125, | |
| "epoch": 0.1342155009451796, | |
| "grad_norm": 1.959995440169171, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.9124147162200534e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4995213747024536, | |
| "reward_std": 0.3407973051071167, | |
| "rewards/accuracy_reward": 0.5073338747024536, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 71 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 79.31640625, | |
| "epoch": 0.13610586011342155, | |
| "grad_norm": 1.8897107714275212, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.9099681117380486e-06, | |
| "loss": 0.0021, | |
| "reward": 1.634920597076416, | |
| "reward_std": 0.21926391124725342, | |
| "rewards/accuracy_reward": 0.6349206566810608, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 83.95703125, | |
| "epoch": 0.13799621928166353, | |
| "grad_norm": 1.6792021771564014, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.907489414035662e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7418066263198853, | |
| "reward_std": 0.20898818969726562, | |
| "rewards/accuracy_reward": 0.7457128763198853, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 73 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.25, | |
| "completion_length": 82.68359375, | |
| "epoch": 0.13988657844990549, | |
| "grad_norm": 18.661291363199766, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.9049787105328714e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5272233486175537, | |
| "reward_std": 0.22412577271461487, | |
| "rewards/accuracy_reward": 0.5311296582221985, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 74 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.87890625, | |
| "epoch": 0.14177693761814744, | |
| "grad_norm": 1.968107409229428, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1.9024360897784505e-06, | |
| "loss": 0.0022, | |
| "reward": 1.53652024269104, | |
| "reward_std": 0.2906913161277771, | |
| "rewards/accuracy_reward": 0.5443326234817505, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 75 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.5546875, | |
| "epoch": 0.14366729678638943, | |
| "grad_norm": 1.9463441104137431, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.8998616414468477e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5365304946899414, | |
| "reward_std": 0.22610870003700256, | |
| "rewards/accuracy_reward": 0.5365304350852966, | |
| "rewards/format_reward": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 89.98046875, | |
| "epoch": 0.14555765595463138, | |
| "grad_norm": 4.159641747444261, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.897255456335022e-06, | |
| "loss": 0.0021, | |
| "reward": 1.530354619026184, | |
| "reward_std": 0.2696187496185303, | |
| "rewards/accuracy_reward": 0.5303546190261841, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.078125, | |
| "epoch": 0.14744801512287334, | |
| "grad_norm": 1.8829618370755878, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.894617626359242e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6262216567993164, | |
| "reward_std": 0.2085573971271515, | |
| "rewards/accuracy_reward": 0.6262217164039612, | |
| "rewards/format_reward": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.125, | |
| "completion_length": 92.19921875, | |
| "epoch": 0.14933837429111532, | |
| "grad_norm": 3.1281418285199996, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.8919482445518434e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5566154718399048, | |
| "reward_std": 0.2710142731666565, | |
| "rewards/accuracy_reward": 0.5566154718399048, | |
| "rewards/format_reward": 1.0, | |
| "step": 79 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 97.9453125, | |
| "epoch": 0.15122873345935728, | |
| "grad_norm": 1.8453083650465387, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.8892474050579476e-06, | |
| "loss": 0.002, | |
| "reward": 1.526172399520874, | |
| "reward_std": 0.15693798661231995, | |
| "rewards/accuracy_reward": 0.5261724591255188, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 96.15625, | |
| "epoch": 0.15311909262759923, | |
| "grad_norm": 1.618450951037405, | |
| "kl": 0.052734375, | |
| "learning_rate": 1.8865152031321425e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5617804527282715, | |
| "reward_std": 0.20894506573677063, | |
| "rewards/accuracy_reward": 0.5656867027282715, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 81 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.93359375, | |
| "epoch": 0.15500945179584122, | |
| "grad_norm": 3.3619693904490346, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.8837517351351212e-06, | |
| "loss": 0.002, | |
| "reward": 1.502871036529541, | |
| "reward_std": 0.2329874485731125, | |
| "rewards/accuracy_reward": 0.5028710961341858, | |
| "rewards/format_reward": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.09375, | |
| "completion_length": 100.8671875, | |
| "epoch": 0.15689981096408318, | |
| "grad_norm": 2.0482069336725717, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.8809570985302861e-06, | |
| "loss": 0.002, | |
| "reward": 1.5919384956359863, | |
| "reward_std": 0.2818424105644226, | |
| "rewards/accuracy_reward": 0.5919384956359863, | |
| "rewards/format_reward": 1.0, | |
| "step": 83 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.40234375, | |
| "epoch": 0.15879017013232513, | |
| "grad_norm": 1.5003724294387097, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.8781313918803083e-06, | |
| "loss": 0.002, | |
| "reward": 1.5504703521728516, | |
| "reward_std": 0.22670012712478638, | |
| "rewards/accuracy_reward": 0.5739079117774963, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 84 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.7421875, | |
| "epoch": 0.16068052930056712, | |
| "grad_norm": 4.602359933501162, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.8752747148436542e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5955908298492432, | |
| "reward_std": 0.164137065410614, | |
| "rewards/accuracy_reward": 0.5955909490585327, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 92.83203125, | |
| "epoch": 0.16257088846880907, | |
| "grad_norm": 1.903155543992341, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.8723871681710694e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4634450674057007, | |
| "reward_std": 0.20022635161876678, | |
| "rewards/accuracy_reward": 0.4634450674057007, | |
| "rewards/format_reward": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.84765625, | |
| "epoch": 0.16446124763705103, | |
| "grad_norm": 1.6427736050864759, | |
| "kl": 0.06640625, | |
| "learning_rate": 1.8694688537020258e-06, | |
| "loss": 0.0027, | |
| "reward": 1.4603149890899658, | |
| "reward_std": 0.1893424689769745, | |
| "rewards/accuracy_reward": 0.468127578496933, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 87 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 98.12890625, | |
| "epoch": 0.166351606805293, | |
| "grad_norm": 1.2849844281526226, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1.866519874361129e-06, | |
| "loss": 0.0022, | |
| "reward": 1.493840217590332, | |
| "reward_std": 0.1990819126367569, | |
| "rewards/accuracy_reward": 0.5055589079856873, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 88 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.03125, | |
| "completion_length": 97.5546875, | |
| "epoch": 0.16824196597353497, | |
| "grad_norm": 3.36212198927166, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.8635403341544897e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6382396221160889, | |
| "reward_std": 0.29065367579460144, | |
| "rewards/accuracy_reward": 0.6421457529067993, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 89 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.35546875, | |
| "epoch": 0.17013232514177692, | |
| "grad_norm": 1.964853567576083, | |
| "kl": 0.052001953125, | |
| "learning_rate": 1.8605303381660542e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4816901683807373, | |
| "reward_std": 0.2411368191242218, | |
| "rewards/accuracy_reward": 0.48559650778770447, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 90 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 97.84375, | |
| "epoch": 0.1720226843100189, | |
| "grad_norm": 1.7427774741536497, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.8574899925538995e-06, | |
| "loss": 0.0019, | |
| "reward": 1.594164490699768, | |
| "reward_std": 0.22482213377952576, | |
| "rewards/accuracy_reward": 0.6176020503044128, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 91 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.25, | |
| "completion_length": 96.33203125, | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 1.178350633076914, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.8544194045464886e-06, | |
| "loss": 0.0019, | |
| "reward": 1.570425271987915, | |
| "reward_std": 0.15437397360801697, | |
| "rewards/accuracy_reward": 0.5704251527786255, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 96.08984375, | |
| "epoch": 0.17580340264650285, | |
| "grad_norm": 1.961774060919074, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.8513186824388878e-06, | |
| "loss": 0.002, | |
| "reward": 1.4320415258407593, | |
| "reward_std": 0.18912720680236816, | |
| "rewards/accuracy_reward": 0.44766655564308167, | |
| "rewards/format_reward": 0.984375, | |
| "step": 93 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 99.69921875, | |
| "epoch": 0.1776937618147448, | |
| "grad_norm": 1.5015570744898201, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.8481879355889493e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5381855964660645, | |
| "reward_std": 0.2091609686613083, | |
| "rewards/accuracy_reward": 0.5459980964660645, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 94 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.34375, | |
| "completion_length": 93.59765625, | |
| "epoch": 0.17958412098298676, | |
| "grad_norm": 1.5238358848316345, | |
| "kl": 0.0576171875, | |
| "learning_rate": 1.8450272744134533e-06, | |
| "loss": 0.0023, | |
| "reward": 1.4812531471252441, | |
| "reward_std": 0.15344488620758057, | |
| "rewards/accuracy_reward": 0.48125314712524414, | |
| "rewards/format_reward": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.3125, | |
| "completion_length": 101.0390625, | |
| "epoch": 0.18147448015122875, | |
| "grad_norm": 1.3836535628215803, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1.8418368103842122e-06, | |
| "loss": 0.0022, | |
| "reward": 1.4727139472961426, | |
| "reward_std": 0.20200154185295105, | |
| "rewards/accuracy_reward": 0.4766201078891754, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 96 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.125, | |
| "completion_length": 100.37109375, | |
| "epoch": 0.1833648393194707, | |
| "grad_norm": 4.036599697785966, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.8386166560241431e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5775189399719238, | |
| "reward_std": 0.2264866977930069, | |
| "rewards/accuracy_reward": 0.5775189399719238, | |
| "rewards/format_reward": 1.0, | |
| "step": 97 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.52734375, | |
| "epoch": 0.18525519848771266, | |
| "grad_norm": 2.004043176233875, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.835366924903295e-06, | |
| "loss": 0.0019, | |
| "reward": 1.612939476966858, | |
| "reward_std": 0.20541028678417206, | |
| "rewards/accuracy_reward": 0.6129394769668579, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 86.5625, | |
| "epoch": 0.18714555765595464, | |
| "grad_norm": 1.4852739381285651, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.8320877316348453e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6532118320465088, | |
| "reward_std": 0.14846470952033997, | |
| "rewards/accuracy_reward": 0.6532118320465088, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.125, | |
| "completion_length": 99.05078125, | |
| "epoch": 0.1890359168241966, | |
| "grad_norm": 4.076356880937712, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1.8287791918710584e-06, | |
| "loss": 0.0017, | |
| "reward": 1.478124976158142, | |
| "reward_std": 0.32335546612739563, | |
| "rewards/accuracy_reward": 0.4859375059604645, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 100 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.03125, | |
| "completion_length": 91.46875, | |
| "epoch": 0.19092627599243855, | |
| "grad_norm": 3.6383673386285675, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.8254414222992057e-06, | |
| "loss": 0.0017, | |
| "reward": 1.6602026224136353, | |
| "reward_std": 0.30003082752227783, | |
| "rewards/accuracy_reward": 0.6602025628089905, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.0625, | |
| "completion_length": 90.078125, | |
| "epoch": 0.19281663516068054, | |
| "grad_norm": 2.071351404946499, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.8220745406374495e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6283621788024902, | |
| "reward_std": 0.2751215100288391, | |
| "rewards/accuracy_reward": 0.6283620595932007, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 97.75390625, | |
| "epoch": 0.1947069943289225, | |
| "grad_norm": 1.97407172472788, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.8186786656306934e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5005707740783691, | |
| "reward_std": 0.2729160189628601, | |
| "rewards/accuracy_reward": 0.5083833336830139, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 103 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.484375, | |
| "epoch": 0.19659735349716445, | |
| "grad_norm": 2.1911504081514734, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.8152539170463922e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5098209381103516, | |
| "reward_std": 0.24280044436454773, | |
| "rewards/accuracy_reward": 0.5137272477149963, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 104 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 91.6875, | |
| "epoch": 0.19848771266540643, | |
| "grad_norm": 2.1758066744800924, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.8118004156703295e-06, | |
| "loss": 0.0019, | |
| "reward": 1.666813611984253, | |
| "reward_std": 0.2204323410987854, | |
| "rewards/accuracy_reward": 0.6668134927749634, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 85.43359375, | |
| "epoch": 0.2003780718336484, | |
| "grad_norm": 2.2920515987360774, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.808318283302356e-06, | |
| "loss": 0.002, | |
| "reward": 1.6315945386886597, | |
| "reward_std": 0.17060711979866028, | |
| "rewards/accuracy_reward": 0.6315945386886597, | |
| "rewards/format_reward": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.27734375, | |
| "epoch": 0.20226843100189035, | |
| "grad_norm": 1.6468108640255814, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.8048076427520956e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6242039203643799, | |
| "reward_std": 0.2274279147386551, | |
| "rewards/accuracy_reward": 0.6281101703643799, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 107 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.34375, | |
| "completion_length": 90.59765625, | |
| "epoch": 0.20415879017013233, | |
| "grad_norm": 1.66499120414633, | |
| "kl": 0.052978515625, | |
| "learning_rate": 1.801268617834614e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4206892251968384, | |
| "reward_std": 0.15304508805274963, | |
| "rewards/accuracy_reward": 0.4206892251968384, | |
| "rewards/format_reward": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.125, | |
| "completion_length": 92.9296875, | |
| "epoch": 0.2060491493383743, | |
| "grad_norm": 1.3046512489220061, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.7977013333660498e-06, | |
| "loss": 0.0018, | |
| "reward": 1.637601613998413, | |
| "reward_std": 0.17598497867584229, | |
| "rewards/accuracy_reward": 0.6571328639984131, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 109 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 87.953125, | |
| "epoch": 0.20793950850661624, | |
| "grad_norm": 1.9099290554016326, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.7941059151592145e-06, | |
| "loss": 0.0018, | |
| "reward": 1.579087734222412, | |
| "reward_std": 0.17340317368507385, | |
| "rewards/accuracy_reward": 0.5829939842224121, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 110 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.56640625, | |
| "epoch": 0.20982986767485823, | |
| "grad_norm": 6.034052106072661, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.7904824900191555e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5380107164382935, | |
| "reward_std": 0.1255079060792923, | |
| "rewards/accuracy_reward": 0.5380107164382935, | |
| "rewards/format_reward": 1.0, | |
| "step": 111 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 92.42578125, | |
| "epoch": 0.21172022684310018, | |
| "grad_norm": 1.4881947802094542, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.786831185738682e-06, | |
| "loss": 0.002, | |
| "reward": 1.5942175388336182, | |
| "reward_std": 0.12322144210338593, | |
| "rewards/accuracy_reward": 0.5981237888336182, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 112 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 89.22265625, | |
| "epoch": 0.21361058601134217, | |
| "grad_norm": 3.5770227394508876, | |
| "kl": 0.04931640625, | |
| "learning_rate": 1.7831521310938587e-06, | |
| "loss": 0.002, | |
| "reward": 1.5163066387176514, | |
| "reward_std": 0.1974867880344391, | |
| "rewards/accuracy_reward": 0.5163066387176514, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 82.1328125, | |
| "epoch": 0.21550094517958412, | |
| "grad_norm": 2.212730582665482, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.7794454558394657e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6943297386169434, | |
| "reward_std": 0.18354207277297974, | |
| "rewards/accuracy_reward": 0.6943297982215881, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.8203125, | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 4.361936414639028, | |
| "kl": 0.056396484375, | |
| "learning_rate": 1.7757112907044198e-06, | |
| "loss": 0.0023, | |
| "reward": 1.555484652519226, | |
| "reward_std": 0.1996951699256897, | |
| "rewards/accuracy_reward": 0.5554846525192261, | |
| "rewards/format_reward": 1.0, | |
| "step": 115 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 87.26171875, | |
| "epoch": 0.21928166351606806, | |
| "grad_norm": 1.8180751664027779, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.7719497673871651e-06, | |
| "loss": 0.002, | |
| "reward": 1.4978692531585693, | |
| "reward_std": 0.19965368509292603, | |
| "rewards/accuracy_reward": 0.4978693127632141, | |
| "rewards/format_reward": 1.0, | |
| "step": 116 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 93.81640625, | |
| "epoch": 0.22117202268431002, | |
| "grad_norm": 1.8114429952435827, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.7681610185510283e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6657145023345947, | |
| "reward_std": 0.15193983912467957, | |
| "rewards/accuracy_reward": 0.66962069272995, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 117 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.76171875, | |
| "epoch": 0.22306238185255198, | |
| "grad_norm": 3.054644149610559, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.7643451778195394e-06, | |
| "loss": 0.002, | |
| "reward": 1.5918383598327637, | |
| "reward_std": 0.20005470514297485, | |
| "rewards/accuracy_reward": 0.5918383002281189, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.08203125, | |
| "epoch": 0.22495274102079396, | |
| "grad_norm": 2.092558777656238, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.7605023797717194e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6277587413787842, | |
| "reward_std": 0.19753384590148926, | |
| "rewards/accuracy_reward": 0.6277587413787842, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.5859375, | |
| "epoch": 0.22684310018903592, | |
| "grad_norm": 4.221955954568503, | |
| "kl": 0.051025390625, | |
| "learning_rate": 1.7566327599373336e-06, | |
| "loss": 0.002, | |
| "reward": 1.6072583198547363, | |
| "reward_std": 0.1987745761871338, | |
| "rewards/accuracy_reward": 0.6072583794593811, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.83984375, | |
| "epoch": 0.22873345935727787, | |
| "grad_norm": 1.8343630634483405, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.7527364547921118e-06, | |
| "loss": 0.002, | |
| "reward": 1.6175568103790283, | |
| "reward_std": 0.17853425443172455, | |
| "rewards/accuracy_reward": 0.6175566911697388, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 84.7734375, | |
| "epoch": 0.23062381852551986, | |
| "grad_norm": 1.3989906719734049, | |
| "kl": 0.0615234375, | |
| "learning_rate": 1.748813601752935e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7062667608261108, | |
| "reward_std": 0.1319604218006134, | |
| "rewards/accuracy_reward": 0.7062667012214661, | |
| "rewards/format_reward": 1.0, | |
| "step": 122 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.40234375, | |
| "epoch": 0.23251417769376181, | |
| "grad_norm": 1.9445947479760342, | |
| "kl": 0.052978515625, | |
| "learning_rate": 1.7448643391729886e-06, | |
| "loss": 0.0021, | |
| "reward": 1.573242425918579, | |
| "reward_std": 0.2199619710445404, | |
| "rewards/accuracy_reward": 0.5849611759185791, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 123 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 92.703125, | |
| "epoch": 0.23440453686200377, | |
| "grad_norm": 1.9821554924076896, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.7408888063368838e-06, | |
| "loss": 0.0019, | |
| "reward": 1.517979621887207, | |
| "reward_std": 0.1772761046886444, | |
| "rewards/accuracy_reward": 0.517979621887207, | |
| "rewards/format_reward": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.2421875, | |
| "epoch": 0.23629489603024575, | |
| "grad_norm": 1.7860361568391379, | |
| "kl": 0.056640625, | |
| "learning_rate": 1.7368871434557445e-06, | |
| "loss": 0.0023, | |
| "reward": 1.5142911672592163, | |
| "reward_std": 0.24644066393375397, | |
| "rewards/accuracy_reward": 0.5221036672592163, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 125 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.0, | |
| "completion_length": 100.96484375, | |
| "epoch": 0.2381852551984877, | |
| "grad_norm": 2.329904142862184, | |
| "kl": 0.046875, | |
| "learning_rate": 1.7328594916622615e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5455485582351685, | |
| "reward_std": 0.35480332374572754, | |
| "rewards/accuracy_reward": 0.5650798082351685, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 126 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.55859375, | |
| "epoch": 0.24007561436672967, | |
| "grad_norm": 1.3180884643239135, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.7288059930057165e-06, | |
| "loss": 0.0018, | |
| "reward": 1.626103401184082, | |
| "reward_std": 0.19042542576789856, | |
| "rewards/accuracy_reward": 0.633915901184082, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 127 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 96.4765625, | |
| "epoch": 0.24196597353497165, | |
| "grad_norm": 1.631849621742655, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.7247267904469723e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5507967472076416, | |
| "reward_std": 0.21631184220314026, | |
| "rewards/accuracy_reward": 0.5586091876029968, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 128 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.25, | |
| "completion_length": 90.7265625, | |
| "epoch": 0.2438563327032136, | |
| "grad_norm": 2.019747189377822, | |
| "kl": 0.054931640625, | |
| "learning_rate": 1.7206220278534285e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5207030773162842, | |
| "reward_std": 0.14799641072750092, | |
| "rewards/accuracy_reward": 0.528515636920929, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 129 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.640625, | |
| "epoch": 0.24574669187145556, | |
| "grad_norm": 1.3384988128455364, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.7164918499939501e-06, | |
| "loss": 0.0022, | |
| "reward": 1.621284008026123, | |
| "reward_std": 0.16135218739509583, | |
| "rewards/accuracy_reward": 0.6330028176307678, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 130 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.1015625, | |
| "epoch": 0.24763705103969755, | |
| "grad_norm": 1.687141544181196, | |
| "kl": 0.0576171875, | |
| "learning_rate": 1.712336402533761e-06, | |
| "loss": 0.0023, | |
| "reward": 1.5703125, | |
| "reward_std": 0.19187898933887482, | |
| "rewards/accuracy_reward": 0.5703125, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.66796875, | |
| "epoch": 0.2495274102079395, | |
| "grad_norm": 79.59867493675014, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.7081558320293053e-06, | |
| "loss": 0.002, | |
| "reward": 1.590255618095398, | |
| "reward_std": 0.1713361293077469, | |
| "rewards/accuracy_reward": 0.5941617488861084, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 132 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.09375, | |
| "completion_length": 87.046875, | |
| "epoch": 0.2514177693761815, | |
| "grad_norm": 1.4501119642103175, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.7039502859230797e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6924138069152832, | |
| "reward_std": 0.1737845093011856, | |
| "rewards/accuracy_reward": 0.6924139261245728, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 83.3828125, | |
| "epoch": 0.2533081285444234, | |
| "grad_norm": 1.753007888815303, | |
| "kl": 0.060791015625, | |
| "learning_rate": 1.699719912538434e-06, | |
| "loss": 0.0024, | |
| "reward": 1.6514040231704712, | |
| "reward_std": 0.15027320384979248, | |
| "rewards/accuracy_reward": 0.6514040231704712, | |
| "rewards/format_reward": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 84.77734375, | |
| "epoch": 0.2551984877126654, | |
| "grad_norm": 2.162337902604871, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1.6954648610743384e-06, | |
| "loss": 0.0023, | |
| "reward": 1.534517765045166, | |
| "reward_std": 0.2556450366973877, | |
| "rewards/accuracy_reward": 0.534517765045166, | |
| "rewards/format_reward": 1.0, | |
| "step": 135 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.46484375, | |
| "epoch": 0.2570888468809074, | |
| "grad_norm": 2.2567486633448244, | |
| "kl": 0.059326171875, | |
| "learning_rate": 1.6911852816001217e-06, | |
| "loss": 0.0024, | |
| "reward": 1.4765625, | |
| "reward_std": 0.22168521583080292, | |
| "rewards/accuracy_reward": 0.4765624701976776, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.22265625, | |
| "epoch": 0.2589792060491493, | |
| "grad_norm": 1.685961718455796, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.6868813250501808e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5958139896392822, | |
| "reward_std": 0.23510046303272247, | |
| "rewards/accuracy_reward": 0.595814049243927, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.125, | |
| "completion_length": 92.0390625, | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 3.0694185267878074, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.682553143218654e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6615285873413086, | |
| "reward_std": 0.18214064836502075, | |
| "rewards/accuracy_reward": 0.6810599565505981, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 138 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 95.0703125, | |
| "epoch": 0.2627599243856333, | |
| "grad_norm": 1.8498537052874253, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.6782008887540702e-06, | |
| "loss": 0.002, | |
| "reward": 1.5477758646011353, | |
| "reward_std": 0.20087680220603943, | |
| "rewards/accuracy_reward": 0.55558842420578, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 139 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.3125, | |
| "completion_length": 90.0, | |
| "epoch": 0.2646502835538752, | |
| "grad_norm": 1.4885477074641043, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.6738247151539643e-06, | |
| "loss": 0.0022, | |
| "reward": 1.4492642879486084, | |
| "reward_std": 0.17115281522274017, | |
| "rewards/accuracy_reward": 0.4570767879486084, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 140 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.39453125, | |
| "epoch": 0.2665406427221172, | |
| "grad_norm": 1.8807841599551895, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.6694247767594622e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4642714262008667, | |
| "reward_std": 0.260834664106369, | |
| "rewards/accuracy_reward": 0.4838026762008667, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 141 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.28125, | |
| "completion_length": 84.546875, | |
| "epoch": 0.2684310018903592, | |
| "grad_norm": 1.820094916728817, | |
| "kl": 0.059814453125, | |
| "learning_rate": 1.665001228749841e-06, | |
| "loss": 0.0024, | |
| "reward": 1.566421627998352, | |
| "reward_std": 0.1025347113609314, | |
| "rewards/accuracy_reward": 0.566421627998352, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 91.03125, | |
| "epoch": 0.27032136105860116, | |
| "grad_norm": 5.252949042087521, | |
| "kl": 0.05859375, | |
| "learning_rate": 1.6605542271370511e-06, | |
| "loss": 0.0023, | |
| "reward": 1.5315755605697632, | |
| "reward_std": 0.18068033456802368, | |
| "rewards/accuracy_reward": 0.5315755605697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.25, | |
| "completion_length": 100.015625, | |
| "epoch": 0.2722117202268431, | |
| "grad_norm": 1.417138335386092, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.6560839287602191e-06, | |
| "loss": 0.002, | |
| "reward": 1.5791447162628174, | |
| "reward_std": 0.14070533215999603, | |
| "rewards/accuracy_reward": 0.5986760258674622, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 144 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.3125, | |
| "completion_length": 94.4375, | |
| "epoch": 0.2741020793950851, | |
| "grad_norm": 1.8097027337596667, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.6515904912801118e-06, | |
| "loss": 0.002, | |
| "reward": 1.4946039915084839, | |
| "reward_std": 0.09259741008281708, | |
| "rewards/accuracy_reward": 0.4946039319038391, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.09375, | |
| "epoch": 0.27599243856332706, | |
| "grad_norm": 1.7093281467382162, | |
| "kl": 0.056884765625, | |
| "learning_rate": 1.6470740731735786e-06, | |
| "loss": 0.0023, | |
| "reward": 1.5983502864837646, | |
| "reward_std": 0.1455744206905365, | |
| "rewards/accuracy_reward": 0.6022564768791199, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 146 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.375, | |
| "completion_length": 95.01953125, | |
| "epoch": 0.277882797731569, | |
| "grad_norm": 1.4598558142559408, | |
| "kl": 0.060546875, | |
| "learning_rate": 1.6425348337279617e-06, | |
| "loss": 0.0024, | |
| "reward": 1.524511694908142, | |
| "reward_std": 0.08749698102474213, | |
| "rewards/accuracy_reward": 0.5284179449081421, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 147 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 97.53515625, | |
| "epoch": 0.27977315689981097, | |
| "grad_norm": 1.6254939755919178, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.6379729330354773e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5108827352523804, | |
| "reward_std": 0.1570434868335724, | |
| "rewards/accuracy_reward": 0.5108827352523804, | |
| "rewards/format_reward": 1.0, | |
| "step": 148 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 102.171875, | |
| "epoch": 0.28166351606805295, | |
| "grad_norm": 6.200286331983825, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.63338853198757e-06, | |
| "loss": 0.0021, | |
| "reward": 1.59765625, | |
| "reward_std": 0.15199562907218933, | |
| "rewards/accuracy_reward": 0.59765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.01953125, | |
| "epoch": 0.2835538752362949, | |
| "grad_norm": 1.3238758411422094, | |
| "kl": 0.06005859375, | |
| "learning_rate": 1.6287817922692394e-06, | |
| "loss": 0.0024, | |
| "reward": 1.5933270454406738, | |
| "reward_std": 0.1732364296913147, | |
| "rewards/accuracy_reward": 0.593326985836029, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 100.703125, | |
| "epoch": 0.28544423440453687, | |
| "grad_norm": 1.4671353082453924, | |
| "kl": 0.05615234375, | |
| "learning_rate": 1.6241528763533351e-06, | |
| "loss": 0.0022, | |
| "reward": 1.521083950996399, | |
| "reward_std": 0.14775413274765015, | |
| "rewards/accuracy_reward": 0.5249902009963989, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 151 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 104.98046875, | |
| "epoch": 0.28733459357277885, | |
| "grad_norm": 1.7191498537725731, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.6195019474948298e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5247777700424194, | |
| "reward_std": 0.1547752171754837, | |
| "rewards/accuracy_reward": 0.5247777700424194, | |
| "rewards/format_reward": 1.0, | |
| "step": 152 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.34375, | |
| "completion_length": 95.25, | |
| "epoch": 0.2892249527410208, | |
| "grad_norm": 1.1025414542451693, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1.6148291697250592e-06, | |
| "loss": 0.0021, | |
| "reward": 1.4935517311096191, | |
| "reward_std": 0.06223775073885918, | |
| "rewards/accuracy_reward": 0.49355170130729675, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 97.43359375, | |
| "epoch": 0.29111531190926276, | |
| "grad_norm": 3.2260317883554293, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1.6101347078459374e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5610017776489258, | |
| "reward_std": 0.1826038360595703, | |
| "rewards/accuracy_reward": 0.5610017776489258, | |
| "rewards/format_reward": 1.0, | |
| "step": 154 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 98.3828125, | |
| "epoch": 0.29300567107750475, | |
| "grad_norm": 3.2855377413213036, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.6054187274241447e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6555452346801758, | |
| "reward_std": 0.21821025013923645, | |
| "rewards/accuracy_reward": 0.6828888654708862, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 155 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 99.22265625, | |
| "epoch": 0.2948960302457467, | |
| "grad_norm": 3.1851976233257986, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.6006813947852892e-06, | |
| "loss": 0.002, | |
| "reward": 1.4952456951141357, | |
| "reward_std": 0.2039714902639389, | |
| "rewards/accuracy_reward": 0.49524572491645813, | |
| "rewards/format_reward": 1.0, | |
| "step": 156 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 92.09375, | |
| "epoch": 0.29678638941398866, | |
| "grad_norm": 2.1557933244114107, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1.5959228770080389e-06, | |
| "loss": 0.0022, | |
| "reward": 1.54817795753479, | |
| "reward_std": 0.19130109250545502, | |
| "rewards/accuracy_reward": 0.54817795753479, | |
| "rewards/format_reward": 1.0, | |
| "step": 157 | |
| }, | |
| { | |
| "all_correct": 0.65625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 87.30078125, | |
| "epoch": 0.29867674858223064, | |
| "grad_norm": 1.5896533313014187, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.5911433419182304e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7363414764404297, | |
| "reward_std": 0.09228336811065674, | |
| "rewards/accuracy_reward": 0.7363415956497192, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 98.4140625, | |
| "epoch": 0.3005671077504726, | |
| "grad_norm": 8.013680822640337, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.5863429580829499e-06, | |
| "loss": 0.0021, | |
| "reward": 1.567818522453308, | |
| "reward_std": 0.161808043718338, | |
| "rewards/accuracy_reward": 0.5709435343742371, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 159 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.125, | |
| "completion_length": 95.6484375, | |
| "epoch": 0.30245746691871456, | |
| "grad_norm": 1.8629346782025675, | |
| "kl": 0.0595703125, | |
| "learning_rate": 1.5815218948045877e-06, | |
| "loss": 0.0024, | |
| "reward": 1.6700589656829834, | |
| "reward_std": 0.22937864065170288, | |
| "rewards/accuracy_reward": 0.6778714656829834, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 160 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.34375, | |
| "completion_length": 90.875, | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 1.2131759411170726, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.5766803221148673e-06, | |
| "loss": 0.0022, | |
| "reward": 1.476467490196228, | |
| "reward_std": 0.1405719369649887, | |
| "rewards/accuracy_reward": 0.4764673709869385, | |
| "rewards/format_reward": 1.0, | |
| "step": 161 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 98.62109375, | |
| "epoch": 0.30623818525519847, | |
| "grad_norm": 1.575835196195424, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.571818410768848e-06, | |
| "loss": 0.002, | |
| "reward": 1.5944631099700928, | |
| "reward_std": 0.18872258067131042, | |
| "rewards/accuracy_reward": 0.594463050365448, | |
| "rewards/format_reward": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 98.5078125, | |
| "epoch": 0.30812854442344045, | |
| "grad_norm": 1.3803997172387867, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.566936332238904e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5406312942504883, | |
| "reward_std": 0.14286097884178162, | |
| "rewards/accuracy_reward": 0.5484437942504883, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 163 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 96.4296875, | |
| "epoch": 0.31001890359168244, | |
| "grad_norm": 4.219394179057573, | |
| "kl": 0.05615234375, | |
| "learning_rate": 1.5620342587086756e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6263850927352905, | |
| "reward_std": 0.19135481119155884, | |
| "rewards/accuracy_reward": 0.6263850927352905, | |
| "rewards/format_reward": 1.0, | |
| "step": 164 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 95.87890625, | |
| "epoch": 0.31190926275992437, | |
| "grad_norm": 1.4782346468519787, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.5571123630669977e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5589654445648193, | |
| "reward_std": 0.15776313841342926, | |
| "rewards/accuracy_reward": 0.5628716349601746, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 165 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 99.24609375, | |
| "epoch": 0.31379962192816635, | |
| "grad_norm": 1.7150440549446804, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1.5521708189018004e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4944391250610352, | |
| "reward_std": 0.2474714070558548, | |
| "rewards/accuracy_reward": 0.4944390654563904, | |
| "rewards/format_reward": 1.0, | |
| "step": 166 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 92.48828125, | |
| "epoch": 0.31568998109640833, | |
| "grad_norm": 2.0979330226121116, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1.5472098004939887e-06, | |
| "loss": 0.0021, | |
| "reward": 1.606818675994873, | |
| "reward_std": 0.19165176153182983, | |
| "rewards/accuracy_reward": 0.610724925994873, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 167 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.4375, | |
| "completion_length": 98.86328125, | |
| "epoch": 0.31758034026465026, | |
| "grad_norm": 1.741385766018968, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1.5422294828112952e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4513907432556152, | |
| "reward_std": 0.07461512833833694, | |
| "rewards/accuracy_reward": 0.45529699325561523, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 168 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.28125, | |
| "completion_length": 104.0859375, | |
| "epoch": 0.31947069943289225, | |
| "grad_norm": 2.346864808547952, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1.537230041502109e-06, | |
| "loss": 0.0017, | |
| "reward": 1.4551925659179688, | |
| "reward_std": 0.19008949398994446, | |
| "rewards/accuracy_reward": 0.46691131591796875, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 169 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.89453125, | |
| "epoch": 0.32136105860113423, | |
| "grad_norm": 1.6647816583702195, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.5322116528892807e-06, | |
| "loss": 0.002, | |
| "reward": 1.731924295425415, | |
| "reward_std": 0.12868990004062653, | |
| "rewards/accuracy_reward": 0.7319241762161255, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 97.6015625, | |
| "epoch": 0.32325141776937616, | |
| "grad_norm": 1.4603160187510866, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.527174493963905e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4863568544387817, | |
| "reward_std": 0.18577060103416443, | |
| "rewards/accuracy_reward": 0.49416929483413696, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 171 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 95.61328125, | |
| "epoch": 0.32514177693761814, | |
| "grad_norm": 1.2779207893845528, | |
| "kl": 0.05419921875, | |
| "learning_rate": 1.5221187423790758e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6187939643859863, | |
| "reward_std": 0.1073196530342102, | |
| "rewards/accuracy_reward": 0.6227001547813416, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 172 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 95.03125, | |
| "epoch": 0.3270321361058601, | |
| "grad_norm": 1.594280468798857, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1.517044576443625e-06, | |
| "loss": 0.0023, | |
| "reward": 1.5663572549819946, | |
| "reward_std": 0.14794546365737915, | |
| "rewards/accuracy_reward": 0.5663573741912842, | |
| "rewards/format_reward": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.125, | |
| "completion_length": 96.19921875, | |
| "epoch": 0.32892249527410206, | |
| "grad_norm": 35.90029700014664, | |
| "kl": 0.056396484375, | |
| "learning_rate": 1.5119521751158296e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7366479635238647, | |
| "reward_std": 0.1478818655014038, | |
| "rewards/accuracy_reward": 0.7366479635238647, | |
| "rewards/format_reward": 1.0, | |
| "step": 174 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.14453125, | |
| "epoch": 0.33081285444234404, | |
| "grad_norm": 1.1879295692139324, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.5068417179971013e-06, | |
| "loss": 0.002, | |
| "reward": 1.6065101623535156, | |
| "reward_std": 0.1347871571779251, | |
| "rewards/accuracy_reward": 0.6104164123535156, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 175 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.0078125, | |
| "epoch": 0.332703213610586, | |
| "grad_norm": 1.7027171491945203, | |
| "kl": 0.056640625, | |
| "learning_rate": 1.5017133853256536e-06, | |
| "loss": 0.0023, | |
| "reward": 1.6415752172470093, | |
| "reward_std": 0.15971241891384125, | |
| "rewards/accuracy_reward": 0.6415751576423645, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.25, | |
| "completion_length": 93.62109375, | |
| "epoch": 0.33459357277882795, | |
| "grad_norm": 1.3008308802468183, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1.4965673579701444e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5429213047027588, | |
| "reward_std": 0.11785108596086502, | |
| "rewards/accuracy_reward": 0.5546400547027588, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 177 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.34375, | |
| "completion_length": 101.2578125, | |
| "epoch": 0.33648393194706994, | |
| "grad_norm": 2.0499201856789537, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.4914038174232954e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4305205345153809, | |
| "reward_std": 0.18109694123268127, | |
| "rewards/accuracy_reward": 0.4383331537246704, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 178 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 94.9765625, | |
| "epoch": 0.3383742911153119, | |
| "grad_norm": 1.3560089780453208, | |
| "kl": 0.051025390625, | |
| "learning_rate": 1.4862229457954937e-06, | |
| "loss": 0.002, | |
| "reward": 1.6644376516342163, | |
| "reward_std": 0.12696640193462372, | |
| "rewards/accuracy_reward": 0.6644376516342163, | |
| "rewards/format_reward": 1.0, | |
| "step": 179 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 95.62109375, | |
| "epoch": 0.34026465028355385, | |
| "grad_norm": 1.312847563615755, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.4810249258083676e-06, | |
| "loss": 0.0021, | |
| "reward": 1.621772050857544, | |
| "reward_std": 0.15236923098564148, | |
| "rewards/accuracy_reward": 0.621772050857544, | |
| "rewards/format_reward": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.3125, | |
| "completion_length": 97.97265625, | |
| "epoch": 0.34215500945179583, | |
| "grad_norm": 1.5335653397548066, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.475809940788342e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5216861963272095, | |
| "reward_std": 0.09969654679298401, | |
| "rewards/accuracy_reward": 0.5255923867225647, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 181 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.13671875, | |
| "epoch": 0.3440453686200378, | |
| "grad_norm": 1.6455259403603757, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.4705781746601738e-06, | |
| "loss": 0.002, | |
| "reward": 1.569726586341858, | |
| "reward_std": 0.20621807873249054, | |
| "rewards/accuracy_reward": 0.5736328363418579, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 182 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 95.23828125, | |
| "epoch": 0.34593572778827975, | |
| "grad_norm": 1.7087145386845783, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.4653298119404645e-06, | |
| "loss": 0.0019, | |
| "reward": 1.566096544265747, | |
| "reward_std": 0.16752052307128906, | |
| "rewards/accuracy_reward": 0.5660964846611023, | |
| "rewards/format_reward": 1.0, | |
| "step": 183 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 86.921875, | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 1.4554606229566311, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1.460065037731152e-06, | |
| "loss": 0.0022, | |
| "reward": 1.592024326324463, | |
| "reward_std": 0.10995283722877502, | |
| "rewards/accuracy_reward": 0.5920243859291077, | |
| "rewards/format_reward": 1.0, | |
| "step": 184 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.21875, | |
| "completion_length": 94.37890625, | |
| "epoch": 0.3497164461247637, | |
| "grad_norm": 1.030689978981618, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.454784037712984e-06, | |
| "loss": 0.0018, | |
| "reward": 1.6271023750305176, | |
| "reward_std": 0.09506059437990189, | |
| "rewards/accuracy_reward": 0.6271023750305176, | |
| "rewards/format_reward": 1.0, | |
| "step": 185 | |
| }, | |
| { | |
| "all_correct": 0.5625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.4765625, | |
| "epoch": 0.3516068052930057, | |
| "grad_norm": 2.3312930785993657, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1.449486998138968e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6988677978515625, | |
| "reward_std": 0.08607158064842224, | |
| "rewards/accuracy_reward": 0.698867678642273, | |
| "rewards/format_reward": 1.0, | |
| "step": 186 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.3125, | |
| "completion_length": 94.50390625, | |
| "epoch": 0.3534971644612476, | |
| "grad_norm": 1.370501060551843, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.4441741058278024e-06, | |
| "loss": 0.002, | |
| "reward": 1.5129822492599487, | |
| "reward_std": 0.1419016569852829, | |
| "rewards/accuracy_reward": 0.5129822492599487, | |
| "rewards/format_reward": 1.0, | |
| "step": 187 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 89.53515625, | |
| "epoch": 0.3553875236294896, | |
| "grad_norm": 1.4724253630775335, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.4388455481572878e-06, | |
| "loss": 0.002, | |
| "reward": 1.6355903148651123, | |
| "reward_std": 0.14424622058868408, | |
| "rewards/accuracy_reward": 0.6394965648651123, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 188 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 99.15234375, | |
| "epoch": 0.3572778827977316, | |
| "grad_norm": 1.1589530938405725, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.4335015130577198e-06, | |
| "loss": 0.0018, | |
| "reward": 1.6473121643066406, | |
| "reward_std": 0.10182252526283264, | |
| "rewards/accuracy_reward": 0.6473122239112854, | |
| "rewards/format_reward": 1.0, | |
| "step": 189 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.296875, | |
| "epoch": 0.3591682419659735, | |
| "grad_norm": 1.5269460633562093, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.428142189005259e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7168800830841064, | |
| "reward_std": 0.12831273674964905, | |
| "rewards/accuracy_reward": 0.7168800234794617, | |
| "rewards/format_reward": 1.0, | |
| "step": 190 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.84765625, | |
| "epoch": 0.3610586011342155, | |
| "grad_norm": 1.5710398769636844, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.4227677650152847e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6753089427947998, | |
| "reward_std": 0.15371738374233246, | |
| "rewards/accuracy_reward": 0.6753089427947998, | |
| "rewards/format_reward": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "all_correct": 0.5625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 97.41015625, | |
| "epoch": 0.3629489603024575, | |
| "grad_norm": 1.3229536281928582, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.417378430635729e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6942713260650635, | |
| "reward_std": 0.08952474594116211, | |
| "rewards/accuracy_reward": 0.6942713260650635, | |
| "rewards/format_reward": 1.0, | |
| "step": 192 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.03125, | |
| "epoch": 0.3648393194706994, | |
| "grad_norm": 2.579144764904904, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1.4119743759403907e-06, | |
| "loss": 0.0021, | |
| "reward": 1.609615683555603, | |
| "reward_std": 0.1612345427274704, | |
| "rewards/accuracy_reward": 0.609615683555603, | |
| "rewards/format_reward": 1.0, | |
| "step": 193 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 98.421875, | |
| "epoch": 0.3667296786389414, | |
| "grad_norm": 4.601074879313099, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1.406555791522232e-06, | |
| "loss": 0.0022, | |
| "reward": 1.6692792177200317, | |
| "reward_std": 0.1721545159816742, | |
| "rewards/accuracy_reward": 0.6731854677200317, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 194 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.25, | |
| "completion_length": 86.79296875, | |
| "epoch": 0.3686200378071834, | |
| "grad_norm": 1.505355960378617, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.401122868486658e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6007030010223389, | |
| "reward_std": 0.1112457737326622, | |
| "rewards/accuracy_reward": 0.6007030606269836, | |
| "rewards/format_reward": 1.0, | |
| "step": 195 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.05078125, | |
| "epoch": 0.3705103969754253, | |
| "grad_norm": 4.21038845661736, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.3956757984447744e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5570985078811646, | |
| "reward_std": 0.15561020374298096, | |
| "rewards/accuracy_reward": 0.5570985078811646, | |
| "rewards/format_reward": 1.0, | |
| "step": 196 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 108.05859375, | |
| "epoch": 0.3724007561436673, | |
| "grad_norm": 1.8421976998281828, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1.3902147735066305e-06, | |
| "loss": 0.0018, | |
| "reward": 1.6113016605377197, | |
| "reward_std": 0.153774231672287, | |
| "rewards/accuracy_reward": 0.615207850933075, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 197 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 100.32421875, | |
| "epoch": 0.3742911153119093, | |
| "grad_norm": 1.4377049848914711, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.3847399862744449e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5971312522888184, | |
| "reward_std": 0.1938011348247528, | |
| "rewards/accuracy_reward": 0.6010375022888184, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 198 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.0, | |
| "epoch": 0.3761814744801512, | |
| "grad_norm": 2.662364277169509, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.3792516298358115e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6497100591659546, | |
| "reward_std": 0.12323208153247833, | |
| "rewards/accuracy_reward": 0.6497100591659546, | |
| "rewards/format_reward": 1.0, | |
| "step": 199 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 93.59765625, | |
| "epoch": 0.3780718336483932, | |
| "grad_norm": 1.7035529891437522, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.37374989775689e-06, | |
| "loss": 0.002, | |
| "reward": 1.563701868057251, | |
| "reward_std": 0.1927732229232788, | |
| "rewards/accuracy_reward": 0.563701868057251, | |
| "rewards/format_reward": 1.0, | |
| "step": 200 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 85.83203125, | |
| "epoch": 0.3799621928166352, | |
| "grad_norm": 1.2571615120784285, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1.3682349840755786e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6499078273773193, | |
| "reward_std": 0.12151362746953964, | |
| "rewards/accuracy_reward": 0.6538141369819641, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 201 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.55078125, | |
| "epoch": 0.3818525519848771, | |
| "grad_norm": 1.8828233098315525, | |
| "kl": 0.052978515625, | |
| "learning_rate": 1.3627070832946716e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5092294216156006, | |
| "reward_std": 0.23872140049934387, | |
| "rewards/accuracy_reward": 0.5092294216156006, | |
| "rewards/format_reward": 1.0, | |
| "step": 202 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.94140625, | |
| "epoch": 0.3837429111531191, | |
| "grad_norm": 1.4414792921156796, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.3571663903749984e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5634148120880127, | |
| "reward_std": 0.18181678652763367, | |
| "rewards/accuracy_reward": 0.5673210024833679, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 203 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 93.390625, | |
| "epoch": 0.3856332703213611, | |
| "grad_norm": 1.76318060561253, | |
| "kl": 0.05615234375, | |
| "learning_rate": 1.351613100728548e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5766992568969727, | |
| "reward_std": 0.18105342984199524, | |
| "rewards/accuracy_reward": 0.5766991972923279, | |
| "rewards/format_reward": 1.0, | |
| "step": 204 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.85546875, | |
| "epoch": 0.387523629489603, | |
| "grad_norm": 1.498906587190416, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.3460474102115784e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5816829204559326, | |
| "reward_std": 0.2008872926235199, | |
| "rewards/accuracy_reward": 0.5816829204559326, | |
| "rewards/format_reward": 1.0, | |
| "step": 205 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.69921875, | |
| "epoch": 0.389413988657845, | |
| "grad_norm": 2.5901803764065447, | |
| "kl": 0.052001953125, | |
| "learning_rate": 1.340469515117706e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5882611274719238, | |
| "reward_std": 0.1939556896686554, | |
| "rewards/accuracy_reward": 0.5882611274719238, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.125, | |
| "completion_length": 85.96875, | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 1.972889284821711, | |
| "kl": 0.05615234375, | |
| "learning_rate": 1.334879612170986e-06, | |
| "loss": 0.0022, | |
| "reward": 1.667292833328247, | |
| "reward_std": 0.2238282561302185, | |
| "rewards/accuracy_reward": 0.6672928929328918, | |
| "rewards/format_reward": 1.0, | |
| "step": 207 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 93.765625, | |
| "epoch": 0.3931947069943289, | |
| "grad_norm": 1.7336225636379132, | |
| "kl": 0.0556640625, | |
| "learning_rate": 1.3292778985189722e-06, | |
| "loss": 0.0022, | |
| "reward": 1.4546148777008057, | |
| "reward_std": 0.202288419008255, | |
| "rewards/accuracy_reward": 0.4546148180961609, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 93.42578125, | |
| "epoch": 0.3950850661625709, | |
| "grad_norm": 1.5326638144326488, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.323664571725764e-06, | |
| "loss": 0.0017, | |
| "reward": 1.581210970878601, | |
| "reward_std": 0.13675948977470398, | |
| "rewards/accuracy_reward": 0.5812109708786011, | |
| "rewards/format_reward": 1.0, | |
| "step": 209 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 90.45703125, | |
| "epoch": 0.39697542533081287, | |
| "grad_norm": 1.6717753066186387, | |
| "kl": 0.055419921875, | |
| "learning_rate": 1.3180398297650392e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5307865142822266, | |
| "reward_std": 0.21199406683444977, | |
| "rewards/accuracy_reward": 0.534692645072937, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 210 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.15625, | |
| "completion_length": 88.4375, | |
| "epoch": 0.3988657844990548, | |
| "grad_norm": 3.0589163307910163, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.3124038710130721e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4704865217208862, | |
| "reward_std": 0.22353151440620422, | |
| "rewards/accuracy_reward": 0.470486581325531, | |
| "rewards/format_reward": 1.0, | |
| "step": 211 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.19140625, | |
| "epoch": 0.4007561436672968, | |
| "grad_norm": 1.7075754827161425, | |
| "kl": 0.052734375, | |
| "learning_rate": 1.3067568942417354e-06, | |
| "loss": 0.0021, | |
| "reward": 1.568968415260315, | |
| "reward_std": 0.19819122552871704, | |
| "rewards/accuracy_reward": 0.5689684152603149, | |
| "rewards/format_reward": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.08984375, | |
| "epoch": 0.40264650283553877, | |
| "grad_norm": 2.1354813874349254, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.3010990986114924e-06, | |
| "loss": 0.002, | |
| "reward": 1.533203125, | |
| "reward_std": 0.2084732949733734, | |
| "rewards/accuracy_reward": 0.552734375, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 213 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 94.58203125, | |
| "epoch": 0.4045368620037807, | |
| "grad_norm": 2.165953349203724, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1.29543068366437e-06, | |
| "loss": 0.0017, | |
| "reward": 1.511252760887146, | |
| "reward_std": 0.2207871973514557, | |
| "rewards/accuracy_reward": 0.511252760887146, | |
| "rewards/format_reward": 1.0, | |
| "step": 214 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 91.39453125, | |
| "epoch": 0.4064272211720227, | |
| "grad_norm": 1.8035046342920265, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.2897518493169238e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5383806228637695, | |
| "reward_std": 0.11602434515953064, | |
| "rewards/accuracy_reward": 0.54228675365448, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 215 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 85.265625, | |
| "epoch": 0.40831758034026466, | |
| "grad_norm": 15.766212379911176, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.284062795853185e-06, | |
| "loss": 0.0018, | |
| "reward": 1.633461356163025, | |
| "reward_std": 0.1580500602722168, | |
| "rewards/accuracy_reward": 0.6334613561630249, | |
| "rewards/format_reward": 1.0, | |
| "step": 216 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 91.390625, | |
| "epoch": 0.4102079395085066, | |
| "grad_norm": 1.5655799499341232, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.2783637239175992e-06, | |
| "loss": 0.002, | |
| "reward": 1.670259952545166, | |
| "reward_std": 0.18198764324188232, | |
| "rewards/accuracy_reward": 0.670259952545166, | |
| "rewards/format_reward": 1.0, | |
| "step": 217 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.125, | |
| "completion_length": 86.6171875, | |
| "epoch": 0.4120982986767486, | |
| "grad_norm": 3.0507186922583482, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.2726548345079474e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6517325639724731, | |
| "reward_std": 0.17904990911483765, | |
| "rewards/accuracy_reward": 0.6517325639724731, | |
| "rewards/format_reward": 1.0, | |
| "step": 218 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 92.4140625, | |
| "epoch": 0.41398865784499056, | |
| "grad_norm": 2.412347509180631, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.2669363289682581e-06, | |
| "loss": 0.002, | |
| "reward": 1.55078125, | |
| "reward_std": 0.20410458743572235, | |
| "rewards/accuracy_reward": 0.57421875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 219 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 100.796875, | |
| "epoch": 0.4158790170132325, | |
| "grad_norm": 1.1653385467510216, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.261208408981708e-06, | |
| "loss": 0.002, | |
| "reward": 1.5812370777130127, | |
| "reward_std": 0.16820110380649567, | |
| "rewards/accuracy_reward": 0.5968619585037231, | |
| "rewards/format_reward": 0.984375, | |
| "step": 220 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.1875, | |
| "completion_length": 89.671875, | |
| "epoch": 0.41776937618147447, | |
| "grad_norm": 1.4929073817173293, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.2554712765635057e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6370710134506226, | |
| "reward_std": 0.11428119242191315, | |
| "rewards/accuracy_reward": 0.6370710134506226, | |
| "rewards/format_reward": 1.0, | |
| "step": 221 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.70703125, | |
| "epoch": 0.41965973534971646, | |
| "grad_norm": 2.2938159154637265, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.2497251340537688e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5362218618392944, | |
| "reward_std": 0.2033883035182953, | |
| "rewards/accuracy_reward": 0.5440343022346497, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 222 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 88.765625, | |
| "epoch": 0.4215500945179584, | |
| "grad_norm": 1.590914762810909, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.2439701841103886e-06, | |
| "loss": 0.0018, | |
| "reward": 1.672126293182373, | |
| "reward_std": 0.18559589982032776, | |
| "rewards/accuracy_reward": 0.6721263527870178, | |
| "rewards/format_reward": 1.0, | |
| "step": 223 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 88.734375, | |
| "epoch": 0.42344045368620037, | |
| "grad_norm": 4.4273662063319925, | |
| "kl": 0.051025390625, | |
| "learning_rate": 1.2382066297018804e-06, | |
| "loss": 0.002, | |
| "reward": 1.5649325847625732, | |
| "reward_std": 0.2352992594242096, | |
| "rewards/accuracy_reward": 0.5649325847625732, | |
| "rewards/format_reward": 1.0, | |
| "step": 224 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.78515625, | |
| "epoch": 0.42533081285444235, | |
| "grad_norm": 2.4145377912675245, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.2324346741002259e-06, | |
| "loss": 0.002, | |
| "reward": 1.6205267906188965, | |
| "reward_std": 0.1337902843952179, | |
| "rewards/accuracy_reward": 0.6205266714096069, | |
| "rewards/format_reward": 1.0, | |
| "step": 225 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.28125, | |
| "completion_length": 87.2734375, | |
| "epoch": 0.42722117202268434, | |
| "grad_norm": 1.3878030528846113, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.2266545208737054e-06, | |
| "loss": 0.002, | |
| "reward": 1.4790351390838623, | |
| "reward_std": 0.1779412180185318, | |
| "rewards/accuracy_reward": 0.47903522849082947, | |
| "rewards/format_reward": 1.0, | |
| "step": 226 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.3125, | |
| "completion_length": 85.23046875, | |
| "epoch": 0.42911153119092627, | |
| "grad_norm": 2.600137524410405, | |
| "kl": 0.05126953125, | |
| "learning_rate": 1.2208663738797165e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5255839824676514, | |
| "reward_std": 0.10616068542003632, | |
| "rewards/accuracy_reward": 0.5255839824676514, | |
| "rewards/format_reward": 1.0, | |
| "step": 227 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 89.4375, | |
| "epoch": 0.43100189035916825, | |
| "grad_norm": 2.29638301880618, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.2150704372575853e-06, | |
| "loss": 0.0021, | |
| "reward": 1.525526762008667, | |
| "reward_std": 0.1583402007818222, | |
| "rewards/accuracy_reward": 0.5255266427993774, | |
| "rewards/format_reward": 1.0, | |
| "step": 228 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 84.92578125, | |
| "epoch": 0.43289224952741023, | |
| "grad_norm": 1.144130055159072, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.2092669154213664e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5835583209991455, | |
| "reward_std": 0.09602068364620209, | |
| "rewards/accuracy_reward": 0.5874645113945007, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 229 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 102.9765625, | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 2.039569056317988, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.203456013052634e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5296072959899902, | |
| "reward_std": 0.2428017556667328, | |
| "rewards/accuracy_reward": 0.529607355594635, | |
| "rewards/format_reward": 1.0, | |
| "step": 230 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 100.609375, | |
| "epoch": 0.43667296786389415, | |
| "grad_norm": 1.8374962293393675, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.1976379350932618e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6126770973205566, | |
| "reward_std": 0.17599979043006897, | |
| "rewards/accuracy_reward": 0.6126769781112671, | |
| "rewards/format_reward": 1.0, | |
| "step": 231 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 86.33203125, | |
| "epoch": 0.43856332703213613, | |
| "grad_norm": 1.3726940756027504, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.1918128867381965e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6991832256317139, | |
| "reward_std": 0.14738750457763672, | |
| "rewards/accuracy_reward": 0.7030894160270691, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 232 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.3125, | |
| "completion_length": 94.38671875, | |
| "epoch": 0.44045368620037806, | |
| "grad_norm": 2.07132527283432, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.1859810734282207e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5090982913970947, | |
| "reward_std": 0.12135301530361176, | |
| "rewards/accuracy_reward": 0.5325357913970947, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 233 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.34375, | |
| "completion_length": 89.13671875, | |
| "epoch": 0.44234404536862004, | |
| "grad_norm": 1.261505450158093, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.1801427008427063e-06, | |
| "loss": 0.002, | |
| "reward": 1.5006786584854126, | |
| "reward_std": 0.11618545651435852, | |
| "rewards/accuracy_reward": 0.5006786584854126, | |
| "rewards/format_reward": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "all_correct": 0.5625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 88.6015625, | |
| "epoch": 0.444234404536862, | |
| "grad_norm": 1.6264673282441953, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.1742979748923608e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7040953636169434, | |
| "reward_std": 0.08967425674200058, | |
| "rewards/accuracy_reward": 0.7040954232215881, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 87.04296875, | |
| "epoch": 0.44612476370510395, | |
| "grad_norm": 1.8797855226580642, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1.1684471017119665e-06, | |
| "loss": 0.0022, | |
| "reward": 1.5616300106048584, | |
| "reward_std": 0.10765929520130157, | |
| "rewards/accuracy_reward": 0.565536379814148, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 236 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 95.9375, | |
| "epoch": 0.44801512287334594, | |
| "grad_norm": 1.3012401350602132, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.1625902876531083e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4816043376922607, | |
| "reward_std": 0.16820675134658813, | |
| "rewards/accuracy_reward": 0.5011356472969055, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 237 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.08203125, | |
| "epoch": 0.4499054820415879, | |
| "grad_norm": 2.6028023149547193, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.156727739276897e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7202575206756592, | |
| "reward_std": 0.14813324809074402, | |
| "rewards/accuracy_reward": 0.7241637706756592, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 238 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 99.9296875, | |
| "epoch": 0.45179584120982985, | |
| "grad_norm": 2.287302725917376, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.1508596633466853e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5219180583953857, | |
| "reward_std": 0.0942273810505867, | |
| "rewards/accuracy_reward": 0.5492618083953857, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 239 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 101.42578125, | |
| "epoch": 0.45368620037807184, | |
| "grad_norm": 3.1114222325682803, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.1449862668207732e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4496355056762695, | |
| "reward_std": 0.19880539178848267, | |
| "rewards/accuracy_reward": 0.46916675567626953, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 240 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 97.19140625, | |
| "epoch": 0.4555765595463138, | |
| "grad_norm": 1.8268906039007968, | |
| "kl": 0.042724609375, | |
| "learning_rate": 1.1391077568451115e-06, | |
| "loss": 0.0017, | |
| "reward": 1.6206369400024414, | |
| "reward_std": 0.1697978675365448, | |
| "rewards/accuracy_reward": 0.6284493803977966, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 241 | |
| }, | |
| { | |
| "all_correct": 0.5625, | |
| "all_wrong": 0.125, | |
| "completion_length": 83.81640625, | |
| "epoch": 0.45746691871455575, | |
| "grad_norm": 3.936269420804981, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1.1332243407459938e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7336182594299316, | |
| "reward_std": 0.11902132630348206, | |
| "rewards/accuracy_reward": 0.7336182594299316, | |
| "rewards/format_reward": 1.0, | |
| "step": 242 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 94.34765625, | |
| "epoch": 0.45935727788279773, | |
| "grad_norm": 2.638770141408573, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.1273362260227457e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6561558246612549, | |
| "reward_std": 0.16947349905967712, | |
| "rewards/accuracy_reward": 0.6600620746612549, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 243 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.83984375, | |
| "epoch": 0.4612476370510397, | |
| "grad_norm": 1.8631283133654752, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.121443620340406e-06, | |
| "loss": 0.002, | |
| "reward": 1.65234375, | |
| "reward_std": 0.19478288292884827, | |
| "rewards/accuracy_reward": 0.68359375, | |
| "rewards/format_reward": 0.96875, | |
| "step": 244 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.28125, | |
| "completion_length": 97.4296875, | |
| "epoch": 0.46313799621928164, | |
| "grad_norm": 1.614309504872873, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.1155467315224037e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4352020025253296, | |
| "reward_std": 0.1766517162322998, | |
| "rewards/accuracy_reward": 0.435202032327652, | |
| "rewards/format_reward": 1.0, | |
| "step": 245 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.640625, | |
| "epoch": 0.46502835538752363, | |
| "grad_norm": 1.6379757636235799, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.1096457675432264e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5429686307907104, | |
| "reward_std": 0.14759376645088196, | |
| "rewards/accuracy_reward": 0.5429686307907104, | |
| "rewards/format_reward": 1.0, | |
| "step": 246 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 91.984375, | |
| "epoch": 0.4669187145557656, | |
| "grad_norm": 2.5291223401978296, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.1037409365210879e-06, | |
| "loss": 0.0018, | |
| "reward": 1.6242541074752808, | |
| "reward_std": 0.2227800339460373, | |
| "rewards/accuracy_reward": 0.6281603574752808, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 247 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 95.96875, | |
| "epoch": 0.46880907372400754, | |
| "grad_norm": 1.5228770177382556, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.0978324467105857e-06, | |
| "loss": 0.0019, | |
| "reward": 1.575097680091858, | |
| "reward_std": 0.1790701001882553, | |
| "rewards/accuracy_reward": 0.5790039300918579, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 248 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.23828125, | |
| "epoch": 0.4706994328922495, | |
| "grad_norm": 2.489169729265948, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.0919205064953581e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5097450017929077, | |
| "reward_std": 0.20132069289684296, | |
| "rewards/accuracy_reward": 0.5097450017929077, | |
| "rewards/format_reward": 1.0, | |
| "step": 249 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 85.8046875, | |
| "epoch": 0.4725897920604915, | |
| "grad_norm": 2.586228799524351, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.0860053243807336e-06, | |
| "loss": 0.002, | |
| "reward": 1.514784574508667, | |
| "reward_std": 0.14834506809711456, | |
| "rewards/accuracy_reward": 0.514784574508667, | |
| "rewards/format_reward": 1.0, | |
| "step": 250 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.62109375, | |
| "epoch": 0.47448015122873344, | |
| "grad_norm": 1.713585099464109, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1.0800871089863784e-06, | |
| "loss": 0.0017, | |
| "reward": 1.586524248123169, | |
| "reward_std": 0.20916607975959778, | |
| "rewards/accuracy_reward": 0.586524248123169, | |
| "rewards/format_reward": 1.0, | |
| "step": 251 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 84.66015625, | |
| "epoch": 0.4763705103969754, | |
| "grad_norm": 2.02730635206175, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.0741660690389365e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6193575859069824, | |
| "reward_std": 0.1369372010231018, | |
| "rewards/accuracy_reward": 0.6193576455116272, | |
| "rewards/format_reward": 1.0, | |
| "step": 252 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 94.953125, | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 2.0450185647249373, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1.068242413364671e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5561109781265259, | |
| "reward_std": 0.18579518795013428, | |
| "rewards/accuracy_reward": 0.5561109781265259, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.21875, | |
| "completion_length": 86.5078125, | |
| "epoch": 0.48015122873345933, | |
| "grad_norm": 3.4092659487344292, | |
| "kl": 0.050048828125, | |
| "learning_rate": 1.0623163508820976e-06, | |
| "loss": 0.002, | |
| "reward": 1.5766924619674683, | |
| "reward_std": 0.10343727469444275, | |
| "rewards/accuracy_reward": 0.5766924619674683, | |
| "rewards/format_reward": 1.0, | |
| "step": 254 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.734375, | |
| "epoch": 0.4820415879017013, | |
| "grad_norm": 1.4529133237212937, | |
| "kl": 0.0517578125, | |
| "learning_rate": 1.0563880905946158e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6530107259750366, | |
| "reward_std": 0.17242193222045898, | |
| "rewards/accuracy_reward": 0.6530107259750366, | |
| "rewards/format_reward": 1.0, | |
| "step": 255 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 85.3359375, | |
| "epoch": 0.4839319470699433, | |
| "grad_norm": 1.6167911610708365, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.0504578415831394e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7061023712158203, | |
| "reward_std": 0.1580139398574829, | |
| "rewards/accuracy_reward": 0.7061024904251099, | |
| "rewards/format_reward": 1.0, | |
| "step": 256 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 89.16015625, | |
| "epoch": 0.48582230623818523, | |
| "grad_norm": 3.1031477511238963, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.0445258129987204e-06, | |
| "loss": 0.0021, | |
| "reward": 1.5749967098236084, | |
| "reward_std": 0.12671510875225067, | |
| "rewards/accuracy_reward": 0.5749967098236084, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 97.05078125, | |
| "epoch": 0.4877126654064272, | |
| "grad_norm": 1.587621420217378, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.0385922140551751e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5610603094100952, | |
| "reward_std": 0.15413255989551544, | |
| "rewards/accuracy_reward": 0.5610603094100952, | |
| "rewards/format_reward": 1.0, | |
| "step": 258 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.26171875, | |
| "epoch": 0.4896030245746692, | |
| "grad_norm": 1.4717842344274057, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.0326572540217027e-06, | |
| "loss": 0.002, | |
| "reward": 1.5245153903961182, | |
| "reward_std": 0.1461203396320343, | |
| "rewards/accuracy_reward": 0.5245153307914734, | |
| "rewards/format_reward": 1.0, | |
| "step": 259 | |
| }, | |
| { | |
| "all_correct": 0.59375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.67578125, | |
| "epoch": 0.4914933837429111, | |
| "grad_norm": 1.2872025757031114, | |
| "kl": 0.048828125, | |
| "learning_rate": 1.026721142215507e-06, | |
| "loss": 0.002, | |
| "reward": 1.6810582876205444, | |
| "reward_std": 0.0886523649096489, | |
| "rewards/accuracy_reward": 0.6810582876205444, | |
| "rewards/format_reward": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.6640625, | |
| "epoch": 0.4933837429111531, | |
| "grad_norm": 2.8413160569475533, | |
| "kl": 0.04931640625, | |
| "learning_rate": 1.0207840879944122e-06, | |
| "loss": 0.002, | |
| "reward": 1.634856104850769, | |
| "reward_std": 0.16094039380550385, | |
| "rewards/accuracy_reward": 0.634856104850769, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.3125, | |
| "completion_length": 92.28515625, | |
| "epoch": 0.4952741020793951, | |
| "grad_norm": 1.3820672801691911, | |
| "kl": 0.046875, | |
| "learning_rate": 1.014846300749481e-06, | |
| "loss": 0.0019, | |
| "reward": 1.5555245876312256, | |
| "reward_std": 0.14345505833625793, | |
| "rewards/accuracy_reward": 0.5555245876312256, | |
| "rewards/format_reward": 1.0, | |
| "step": 262 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 90.9765625, | |
| "epoch": 0.497164461247637, | |
| "grad_norm": 2.081869839864706, | |
| "kl": 0.051025390625, | |
| "learning_rate": 1.0089079898976282e-06, | |
| "loss": 0.002, | |
| "reward": 1.6466023921966553, | |
| "reward_std": 0.1819521188735962, | |
| "rewards/accuracy_reward": 0.6466023921966553, | |
| "rewards/format_reward": 1.0, | |
| "step": 263 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 94.72265625, | |
| "epoch": 0.499054820415879, | |
| "grad_norm": 2.580070748827961, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.0029693648742354e-06, | |
| "loss": 0.0018, | |
| "reward": 1.5194728374481201, | |
| "reward_std": 0.21391144394874573, | |
| "rewards/accuracy_reward": 0.5272853970527649, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 264 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.3515625, | |
| "epoch": 0.500945179584121, | |
| "grad_norm": 2.2976847333751, | |
| "kl": 0.05224609375, | |
| "learning_rate": 9.970306351257645e-07, | |
| "loss": 0.0021, | |
| "reward": 1.6085888147354126, | |
| "reward_std": 0.23032766580581665, | |
| "rewards/accuracy_reward": 0.6085888147354126, | |
| "rewards/format_reward": 1.0, | |
| "step": 265 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 88.63671875, | |
| "epoch": 0.502835538752363, | |
| "grad_norm": 1.5748351110633412, | |
| "kl": 0.0576171875, | |
| "learning_rate": 9.910920101023717e-07, | |
| "loss": 0.0023, | |
| "reward": 1.4456298351287842, | |
| "reward_std": 0.1709638237953186, | |
| "rewards/accuracy_reward": 0.4456298351287842, | |
| "rewards/format_reward": 1.0, | |
| "step": 266 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.125, | |
| "completion_length": 91.58984375, | |
| "epoch": 0.504725897920605, | |
| "grad_norm": 2.770591512834709, | |
| "kl": 0.0517578125, | |
| "learning_rate": 9.851536992505187e-07, | |
| "loss": 0.0021, | |
| "reward": 1.653957724571228, | |
| "reward_std": 0.20085959136486053, | |
| "rewards/accuracy_reward": 0.653957724571228, | |
| "rewards/format_reward": 1.0, | |
| "step": 267 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.51953125, | |
| "epoch": 0.5066162570888468, | |
| "grad_norm": 1.6226350288665303, | |
| "kl": 0.047119140625, | |
| "learning_rate": 9.792159120055879e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5789850950241089, | |
| "reward_std": 0.17338192462921143, | |
| "rewards/accuracy_reward": 0.5789849758148193, | |
| "rewards/format_reward": 1.0, | |
| "step": 268 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.34375, | |
| "completion_length": 93.02734375, | |
| "epoch": 0.5085066162570888, | |
| "grad_norm": 1.2047460673900767, | |
| "kl": 0.051513671875, | |
| "learning_rate": 9.732788577844932e-07, | |
| "loss": 0.0021, | |
| "reward": 1.5021183490753174, | |
| "reward_std": 0.14113232493400574, | |
| "rewards/accuracy_reward": 0.5021182894706726, | |
| "rewards/format_reward": 1.0, | |
| "step": 269 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 86.44140625, | |
| "epoch": 0.5103969754253308, | |
| "grad_norm": 3.9170636810231727, | |
| "kl": 0.050537109375, | |
| "learning_rate": 9.673427459782974e-07, | |
| "loss": 0.002, | |
| "reward": 1.5727362632751465, | |
| "reward_std": 0.18539920449256897, | |
| "rewards/accuracy_reward": 0.5727362036705017, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.015625, | |
| "epoch": 0.5122873345935728, | |
| "grad_norm": 1.366883049982446, | |
| "kl": 0.05078125, | |
| "learning_rate": 9.61407785944825e-07, | |
| "loss": 0.002, | |
| "reward": 1.6010971069335938, | |
| "reward_std": 0.1428610235452652, | |
| "rewards/accuracy_reward": 0.6050034761428833, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 271 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.28125, | |
| "completion_length": 90.48046875, | |
| "epoch": 0.5141776937618148, | |
| "grad_norm": 3.3253184693080673, | |
| "kl": 0.053955078125, | |
| "learning_rate": 9.554741870012795e-07, | |
| "loss": 0.0022, | |
| "reward": 1.478670358657837, | |
| "reward_std": 0.16775619983673096, | |
| "rewards/accuracy_reward": 0.4786703884601593, | |
| "rewards/format_reward": 1.0, | |
| "step": 272 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 86.51171875, | |
| "epoch": 0.5160680529300568, | |
| "grad_norm": 1.549632672045942, | |
| "kl": 0.048095703125, | |
| "learning_rate": 9.495421584168608e-07, | |
| "loss": 0.0019, | |
| "reward": 1.594543695449829, | |
| "reward_std": 0.18858283758163452, | |
| "rewards/accuracy_reward": 0.5984500050544739, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 273 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 84.9453125, | |
| "epoch": 0.5179584120982986, | |
| "grad_norm": 2.314889573705669, | |
| "kl": 0.050048828125, | |
| "learning_rate": 9.436119094053845e-07, | |
| "loss": 0.002, | |
| "reward": 1.5329444408416748, | |
| "reward_std": 0.19728565216064453, | |
| "rewards/accuracy_reward": 0.5329445004463196, | |
| "rewards/format_reward": 1.0, | |
| "step": 274 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 91.96484375, | |
| "epoch": 0.5198487712665406, | |
| "grad_norm": 3.0057549499090004, | |
| "kl": 0.046142578125, | |
| "learning_rate": 9.376836491179027e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6376956701278687, | |
| "reward_std": 0.187890887260437, | |
| "rewards/accuracy_reward": 0.6376956701278687, | |
| "rewards/format_reward": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 83.40234375, | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 2.381324526997287, | |
| "kl": 0.04931640625, | |
| "learning_rate": 9.317575866353291e-07, | |
| "loss": 0.002, | |
| "reward": 1.5451953411102295, | |
| "reward_std": 0.15440954267978668, | |
| "rewards/accuracy_reward": 0.5451953411102295, | |
| "rewards/format_reward": 1.0, | |
| "step": 276 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.62109375, | |
| "epoch": 0.5236294896030246, | |
| "grad_norm": 1.6038678148688907, | |
| "kl": 0.046630859375, | |
| "learning_rate": 9.258339309610636e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5747730731964111, | |
| "reward_std": 0.24761907756328583, | |
| "rewards/accuracy_reward": 0.5786792635917664, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 277 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 92.5625, | |
| "epoch": 0.5255198487712666, | |
| "grad_norm": 2.367357867898568, | |
| "kl": 0.049072265625, | |
| "learning_rate": 9.199128910136218e-07, | |
| "loss": 0.002, | |
| "reward": 1.4614596366882324, | |
| "reward_std": 0.24768300354480743, | |
| "rewards/accuracy_reward": 0.4653658866882324, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 278 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.54296875, | |
| "epoch": 0.5274102079395085, | |
| "grad_norm": 2.2126449160143147, | |
| "kl": 0.043701171875, | |
| "learning_rate": 9.139946756192662e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5789459943771362, | |
| "reward_std": 0.17482253909111023, | |
| "rewards/accuracy_reward": 0.5789459943771362, | |
| "rewards/format_reward": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.51953125, | |
| "epoch": 0.5293005671077504, | |
| "grad_norm": 1.7672859098305684, | |
| "kl": 0.051025390625, | |
| "learning_rate": 9.08079493504642e-07, | |
| "loss": 0.002, | |
| "reward": 1.5148652791976929, | |
| "reward_std": 0.21231761574745178, | |
| "rewards/accuracy_reward": 0.5148652791976929, | |
| "rewards/format_reward": 1.0, | |
| "step": 280 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.3828125, | |
| "epoch": 0.5311909262759924, | |
| "grad_norm": 1.4247234430692624, | |
| "kl": 0.046875, | |
| "learning_rate": 9.021675532894144e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6473538875579834, | |
| "reward_std": 0.14679107069969177, | |
| "rewards/accuracy_reward": 0.6512601971626282, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 281 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.79296875, | |
| "epoch": 0.5330812854442344, | |
| "grad_norm": 1.3257824153478301, | |
| "kl": 0.048828125, | |
| "learning_rate": 8.962590634789123e-07, | |
| "loss": 0.002, | |
| "reward": 1.6150450706481934, | |
| "reward_std": 0.1819513887166977, | |
| "rewards/accuracy_reward": 0.6150450706481934, | |
| "rewards/format_reward": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.34375, | |
| "epoch": 0.5349716446124764, | |
| "grad_norm": 1.6854728753752273, | |
| "kl": 0.05078125, | |
| "learning_rate": 8.903542324567735e-07, | |
| "loss": 0.002, | |
| "reward": 1.5195235013961792, | |
| "reward_std": 0.20223002135753632, | |
| "rewards/accuracy_reward": 0.5390547513961792, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 283 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.28515625, | |
| "epoch": 0.5368620037807184, | |
| "grad_norm": 1.8405481449092091, | |
| "kl": 0.05419921875, | |
| "learning_rate": 8.844532684775963e-07, | |
| "loss": 0.0022, | |
| "reward": 1.592590093612671, | |
| "reward_std": 0.1902393102645874, | |
| "rewards/accuracy_reward": 0.5925900340080261, | |
| "rewards/format_reward": 1.0, | |
| "step": 284 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.78125, | |
| "epoch": 0.5387523629489603, | |
| "grad_norm": 3.7950521224218687, | |
| "kl": 0.044189453125, | |
| "learning_rate": 8.785563796595938e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6031997203826904, | |
| "reward_std": 0.19717274606227875, | |
| "rewards/accuracy_reward": 0.6031997203826904, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.25, | |
| "completion_length": 81.3046875, | |
| "epoch": 0.5406427221172023, | |
| "grad_norm": 2.3590193160665893, | |
| "kl": 0.061767578125, | |
| "learning_rate": 8.726637739772541e-07, | |
| "loss": 0.0025, | |
| "reward": 1.675480842590332, | |
| "reward_std": 0.10290536284446716, | |
| "rewards/accuracy_reward": 0.6754807829856873, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.28125, | |
| "completion_length": 84.125, | |
| "epoch": 0.5425330812854442, | |
| "grad_norm": 1.2126067200747002, | |
| "kl": 0.055908203125, | |
| "learning_rate": 8.667756592540063e-07, | |
| "loss": 0.0022, | |
| "reward": 1.5611882209777832, | |
| "reward_std": 0.11617802083492279, | |
| "rewards/accuracy_reward": 0.5611881017684937, | |
| "rewards/format_reward": 1.0, | |
| "step": 287 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.6796875, | |
| "epoch": 0.5444234404536862, | |
| "grad_norm": 1.966929179296431, | |
| "kl": 0.045166015625, | |
| "learning_rate": 8.608922431548887e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6376736164093018, | |
| "reward_std": 0.18406596779823303, | |
| "rewards/accuracy_reward": 0.6376736760139465, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.125, | |
| "completion_length": 94.79296875, | |
| "epoch": 0.5463137996219282, | |
| "grad_norm": 1.756150509577632, | |
| "kl": 0.0400390625, | |
| "learning_rate": 8.550137331792269e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6595051288604736, | |
| "reward_std": 0.24478332698345184, | |
| "rewards/accuracy_reward": 0.6673176884651184, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 289 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.34375, | |
| "completion_length": 87.40625, | |
| "epoch": 0.5482041587901701, | |
| "grad_norm": 1.9775373962167517, | |
| "kl": 0.052978515625, | |
| "learning_rate": 8.49140336653315e-07, | |
| "loss": 0.0021, | |
| "reward": 1.4470252990722656, | |
| "reward_std": 0.17483514547348022, | |
| "rewards/accuracy_reward": 0.4470253586769104, | |
| "rewards/format_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 88.69921875, | |
| "epoch": 0.5500945179584121, | |
| "grad_norm": 2.063086280742146, | |
| "kl": 0.051025390625, | |
| "learning_rate": 8.432722607231029e-07, | |
| "loss": 0.002, | |
| "reward": 1.6172977685928345, | |
| "reward_std": 0.20396284759044647, | |
| "rewards/accuracy_reward": 0.6172977685928345, | |
| "rewards/format_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.0625, | |
| "completion_length": 99.53515625, | |
| "epoch": 0.5519848771266541, | |
| "grad_norm": 2.621542277949528, | |
| "kl": 0.04150390625, | |
| "learning_rate": 8.374097123468917e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5667483806610107, | |
| "reward_std": 0.24786125123500824, | |
| "rewards/accuracy_reward": 0.5706546306610107, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 292 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 86.796875, | |
| "epoch": 0.553875236294896, | |
| "grad_norm": 2.8165649295162316, | |
| "kl": 0.048095703125, | |
| "learning_rate": 8.315528982880337e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5577614307403564, | |
| "reward_std": 0.25827598571777344, | |
| "rewards/accuracy_reward": 0.5655738711357117, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 293 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.5390625, | |
| "epoch": 0.555765595463138, | |
| "grad_norm": 2.0837535994437473, | |
| "kl": 0.0439453125, | |
| "learning_rate": 8.257020251076392e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5403378009796143, | |
| "reward_std": 0.2276870459318161, | |
| "rewards/accuracy_reward": 0.5403377413749695, | |
| "rewards/format_reward": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 93.94921875, | |
| "epoch": 0.55765595463138, | |
| "grad_norm": 3.9566867940245594, | |
| "kl": 0.05029296875, | |
| "learning_rate": 8.198572991572939e-07, | |
| "loss": 0.002, | |
| "reward": 1.6211934089660645, | |
| "reward_std": 0.21309423446655273, | |
| "rewards/accuracy_reward": 0.6290059089660645, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 295 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.28125, | |
| "completion_length": 100.11328125, | |
| "epoch": 0.5595463137996219, | |
| "grad_norm": 1.6039871922459095, | |
| "kl": 0.046142578125, | |
| "learning_rate": 8.140189265717793e-07, | |
| "loss": 0.0018, | |
| "reward": 1.3850700855255127, | |
| "reward_std": 0.21388718485832214, | |
| "rewards/accuracy_reward": 0.42413264513015747, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 296 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.34375, | |
| "completion_length": 92.2734375, | |
| "epoch": 0.5614366729678639, | |
| "grad_norm": 1.4495763345559223, | |
| "kl": 0.05126953125, | |
| "learning_rate": 8.081871132618035e-07, | |
| "loss": 0.0021, | |
| "reward": 1.4881727695465088, | |
| "reward_std": 0.1367965191602707, | |
| "rewards/accuracy_reward": 0.4920789897441864, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 297 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 78.7109375, | |
| "epoch": 0.5633270321361059, | |
| "grad_norm": 2.195186142439845, | |
| "kl": 0.054443359375, | |
| "learning_rate": 8.023620649067383e-07, | |
| "loss": 0.0022, | |
| "reward": 1.6418862342834473, | |
| "reward_std": 0.17705166339874268, | |
| "rewards/accuracy_reward": 0.6418863534927368, | |
| "rewards/format_reward": 1.0, | |
| "step": 298 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 95.70703125, | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 1.5458362887848978, | |
| "kl": 0.044677734375, | |
| "learning_rate": 7.965439869473663e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5705434083938599, | |
| "reward_std": 0.1792255938053131, | |
| "rewards/accuracy_reward": 0.5705434083938599, | |
| "rewards/format_reward": 1.0, | |
| "step": 299 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.9296875, | |
| "epoch": 0.5671077504725898, | |
| "grad_norm": 2.0503603995341266, | |
| "kl": 0.04541015625, | |
| "learning_rate": 7.907330845786337e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5330439805984497, | |
| "reward_std": 0.1628941148519516, | |
| "rewards/accuracy_reward": 0.5564814805984497, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 300 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 86.5078125, | |
| "epoch": 0.5689981096408318, | |
| "grad_norm": 1.4513016708834336, | |
| "kl": 0.0478515625, | |
| "learning_rate": 7.849295627424147e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6002389192581177, | |
| "reward_std": 0.16803400218486786, | |
| "rewards/accuracy_reward": 0.6158639192581177, | |
| "rewards/format_reward": 0.984375, | |
| "step": 301 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 85.36328125, | |
| "epoch": 0.5708884688090737, | |
| "grad_norm": 2.0386539297024187, | |
| "kl": 0.050537109375, | |
| "learning_rate": 7.791336261202834e-07, | |
| "loss": 0.002, | |
| "reward": 1.6394249200820923, | |
| "reward_std": 0.18532907962799072, | |
| "rewards/accuracy_reward": 0.6550499200820923, | |
| "rewards/format_reward": 0.984375, | |
| "step": 302 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.37109375, | |
| "epoch": 0.5727788279773157, | |
| "grad_norm": 1.7416561634122243, | |
| "kl": 0.047119140625, | |
| "learning_rate": 7.733454791262945e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5273735523223877, | |
| "reward_std": 0.1830909550189972, | |
| "rewards/accuracy_reward": 0.5273735523223877, | |
| "rewards/format_reward": 1.0, | |
| "step": 303 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.0390625, | |
| "epoch": 0.5746691871455577, | |
| "grad_norm": 1.4257574938910071, | |
| "kl": 0.04931640625, | |
| "learning_rate": 7.67565325899774e-07, | |
| "loss": 0.002, | |
| "reward": 1.6209030151367188, | |
| "reward_std": 0.1880435049533844, | |
| "rewards/accuracy_reward": 0.6209030747413635, | |
| "rewards/format_reward": 1.0, | |
| "step": 304 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.640625, | |
| "epoch": 0.5765595463137996, | |
| "grad_norm": 2.523907663059568, | |
| "kl": 0.0478515625, | |
| "learning_rate": 7.617933702981197e-07, | |
| "loss": 0.0019, | |
| "reward": 1.646308183670044, | |
| "reward_std": 0.20141802728176117, | |
| "rewards/accuracy_reward": 0.650214433670044, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 305 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 88.98828125, | |
| "epoch": 0.5784499054820416, | |
| "grad_norm": 1.8869992601885726, | |
| "kl": 0.052978515625, | |
| "learning_rate": 7.560298158896114e-07, | |
| "loss": 0.0021, | |
| "reward": 1.5234375, | |
| "reward_std": 0.19467194378376007, | |
| "rewards/accuracy_reward": 0.52734375, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 306 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.125, | |
| "completion_length": 91.984375, | |
| "epoch": 0.5803402646502835, | |
| "grad_norm": 1.593500363249208, | |
| "kl": 0.050537109375, | |
| "learning_rate": 7.50274865946231e-07, | |
| "loss": 0.002, | |
| "reward": 1.6522129774093628, | |
| "reward_std": 0.15789154171943665, | |
| "rewards/accuracy_reward": 0.6522129774093628, | |
| "rewards/format_reward": 1.0, | |
| "step": 307 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.9140625, | |
| "epoch": 0.5822306238185255, | |
| "grad_norm": 1.5345766665959373, | |
| "kl": 0.046630859375, | |
| "learning_rate": 7.445287234364945e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6170084476470947, | |
| "reward_std": 0.1939898431301117, | |
| "rewards/accuracy_reward": 0.61700838804245, | |
| "rewards/format_reward": 1.0, | |
| "step": 308 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.09375, | |
| "epoch": 0.5841209829867675, | |
| "grad_norm": 1.633714349616306, | |
| "kl": 0.04541015625, | |
| "learning_rate": 7.38791591018292e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5211251974105835, | |
| "reward_std": 0.19408775866031647, | |
| "rewards/accuracy_reward": 0.5367502570152283, | |
| "rewards/format_reward": 0.984375, | |
| "step": 309 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.28515625, | |
| "epoch": 0.5860113421550095, | |
| "grad_norm": 1.7832966629730098, | |
| "kl": 0.044677734375, | |
| "learning_rate": 7.330636710317417e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6072568893432617, | |
| "reward_std": 0.1647563874721527, | |
| "rewards/accuracy_reward": 0.6189756989479065, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 310 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.41796875, | |
| "epoch": 0.5879017013232514, | |
| "grad_norm": 2.1849079934738294, | |
| "kl": 0.047607421875, | |
| "learning_rate": 7.27345165492053e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5788297653198242, | |
| "reward_std": 0.18523138761520386, | |
| "rewards/accuracy_reward": 0.5788298845291138, | |
| "rewards/format_reward": 1.0, | |
| "step": 311 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 98.234375, | |
| "epoch": 0.5897920604914934, | |
| "grad_norm": 1.432954670732317, | |
| "kl": 0.044189453125, | |
| "learning_rate": 7.216362760824009e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6121280193328857, | |
| "reward_std": 0.17768144607543945, | |
| "rewards/accuracy_reward": 0.612127959728241, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 85.05078125, | |
| "epoch": 0.5916824196597353, | |
| "grad_norm": 1.7095977165647167, | |
| "kl": 0.0517578125, | |
| "learning_rate": 7.159372041468149e-07, | |
| "loss": 0.0021, | |
| "reward": 1.6863864660263062, | |
| "reward_std": 0.14531907439231873, | |
| "rewards/accuracy_reward": 0.6863864660263062, | |
| "rewards/format_reward": 1.0, | |
| "step": 313 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 89.55078125, | |
| "epoch": 0.5935727788279773, | |
| "grad_norm": 1.7588812930279742, | |
| "kl": 0.0439453125, | |
| "learning_rate": 7.102481506830763e-07, | |
| "loss": 0.0018, | |
| "reward": 1.4836997985839844, | |
| "reward_std": 0.21439874172210693, | |
| "rewards/accuracy_reward": 0.48369988799095154, | |
| "rewards/format_reward": 1.0, | |
| "step": 314 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 93.41015625, | |
| "epoch": 0.5954631379962193, | |
| "grad_norm": 2.719325851548355, | |
| "kl": 0.04052734375, | |
| "learning_rate": 7.045693163356299e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5147807598114014, | |
| "reward_std": 0.16246028244495392, | |
| "rewards/accuracy_reward": 0.5147807002067566, | |
| "rewards/format_reward": 1.0, | |
| "step": 315 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.28125, | |
| "completion_length": 101.6171875, | |
| "epoch": 0.5973534971644613, | |
| "grad_norm": 12.827054251723668, | |
| "kl": 0.046875, | |
| "learning_rate": 6.989009013885076e-07, | |
| "loss": 0.0019, | |
| "reward": 1.442307710647583, | |
| "reward_std": 0.24046628177165985, | |
| "rewards/accuracy_reward": 0.457932710647583, | |
| "rewards/format_reward": 0.984375, | |
| "step": 316 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.93359375, | |
| "epoch": 0.5992438563327032, | |
| "grad_norm": 1.6360722613389922, | |
| "kl": 0.055908203125, | |
| "learning_rate": 6.932431057582646e-07, | |
| "loss": 0.0022, | |
| "reward": 1.6484375, | |
| "reward_std": 0.16018126904964447, | |
| "rewards/accuracy_reward": 0.6484375, | |
| "rewards/format_reward": 1.0, | |
| "step": 317 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 104.6484375, | |
| "epoch": 0.6011342155009451, | |
| "grad_norm": 1.8443996597658288, | |
| "kl": 0.04345703125, | |
| "learning_rate": 6.875961289869282e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5236172676086426, | |
| "reward_std": 0.20754508674144745, | |
| "rewards/accuracy_reward": 0.5236173272132874, | |
| "rewards/format_reward": 1.0, | |
| "step": 318 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 83.109375, | |
| "epoch": 0.6030245746691871, | |
| "grad_norm": 2.136504314601804, | |
| "kl": 0.051025390625, | |
| "learning_rate": 6.819601702349608e-07, | |
| "loss": 0.002, | |
| "reward": 1.6519629955291748, | |
| "reward_std": 0.15085574984550476, | |
| "rewards/accuracy_reward": 0.6519629955291748, | |
| "rewards/format_reward": 1.0, | |
| "step": 319 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.0625, | |
| "completion_length": 93.57421875, | |
| "epoch": 0.6049149338374291, | |
| "grad_norm": 1.4937943209360285, | |
| "kl": 0.042236328125, | |
| "learning_rate": 6.763354282742362e-07, | |
| "loss": 0.0017, | |
| "reward": 1.628268837928772, | |
| "reward_std": 0.17435705661773682, | |
| "rewards/accuracy_reward": 0.636081337928772, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 320 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.71875, | |
| "epoch": 0.6068052930056711, | |
| "grad_norm": 8.501242318953185, | |
| "kl": 0.051513671875, | |
| "learning_rate": 6.707221014810278e-07, | |
| "loss": 0.0021, | |
| "reward": 1.5977280139923096, | |
| "reward_std": 0.18674521148204803, | |
| "rewards/accuracy_reward": 0.59772789478302, | |
| "rewards/format_reward": 1.0, | |
| "step": 321 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 100.10546875, | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 1.126898780019968, | |
| "kl": 0.046875, | |
| "learning_rate": 6.651203878290138e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6740057468414307, | |
| "reward_std": 0.13990236818790436, | |
| "rewards/accuracy_reward": 0.6740056872367859, | |
| "rewards/format_reward": 1.0, | |
| "step": 322 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.25, | |
| "completion_length": 87.5625, | |
| "epoch": 0.610586011342155, | |
| "grad_norm": 1.0829161392709314, | |
| "kl": 0.0537109375, | |
| "learning_rate": 6.59530484882294e-07, | |
| "loss": 0.0021, | |
| "reward": 1.6339197158813477, | |
| "reward_std": 0.10196228325366974, | |
| "rewards/accuracy_reward": 0.6339195966720581, | |
| "rewards/format_reward": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 85.04296875, | |
| "epoch": 0.6124763705103969, | |
| "grad_norm": 1.5071308132082326, | |
| "kl": 0.048095703125, | |
| "learning_rate": 6.539525897884218e-07, | |
| "loss": 0.0019, | |
| "reward": 1.601670503616333, | |
| "reward_std": 0.11899926513433456, | |
| "rewards/accuracy_reward": 0.6016704440116882, | |
| "rewards/format_reward": 1.0, | |
| "step": 324 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 105.0546875, | |
| "epoch": 0.6143667296786389, | |
| "grad_norm": 1.3796535779838177, | |
| "kl": 0.04638671875, | |
| "learning_rate": 6.48386899271452e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5369006395339966, | |
| "reward_std": 0.1829143464565277, | |
| "rewards/accuracy_reward": 0.5486193895339966, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 325 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.96875, | |
| "epoch": 0.6162570888468809, | |
| "grad_norm": 1.0665567955819852, | |
| "kl": 0.043701171875, | |
| "learning_rate": 6.428336096250017e-07, | |
| "loss": 0.0018, | |
| "reward": 1.632015585899353, | |
| "reward_std": 0.08141334354877472, | |
| "rewards/accuracy_reward": 0.6320155262947083, | |
| "rewards/format_reward": 1.0, | |
| "step": 326 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 98.70703125, | |
| "epoch": 0.6181474480151229, | |
| "grad_norm": 1.8190601838814684, | |
| "kl": 0.0498046875, | |
| "learning_rate": 6.372929167053285e-07, | |
| "loss": 0.002, | |
| "reward": 1.5398609638214111, | |
| "reward_std": 0.19976115226745605, | |
| "rewards/accuracy_reward": 0.5398609638214111, | |
| "rewards/format_reward": 1.0, | |
| "step": 327 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 96.3984375, | |
| "epoch": 0.6200378071833649, | |
| "grad_norm": 10.48494019650004, | |
| "kl": 0.05029296875, | |
| "learning_rate": 6.317650159244212e-07, | |
| "loss": 0.002, | |
| "reward": 1.611169457435608, | |
| "reward_std": 0.17813417315483093, | |
| "rewards/accuracy_reward": 0.6150757074356079, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 328 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.19921875, | |
| "epoch": 0.6219281663516069, | |
| "grad_norm": 3.3256774582394724, | |
| "kl": 0.05029296875, | |
| "learning_rate": 6.262501022431099e-07, | |
| "loss": 0.002, | |
| "reward": 1.6589438915252686, | |
| "reward_std": 0.12664872407913208, | |
| "rewards/accuracy_reward": 0.6628501415252686, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 329 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.03125, | |
| "completion_length": 100.25390625, | |
| "epoch": 0.6238185255198487, | |
| "grad_norm": 1.4299757860972675, | |
| "kl": 0.04052734375, | |
| "learning_rate": 6.207483701641887e-07, | |
| "loss": 0.0016, | |
| "reward": 1.7447913885116577, | |
| "reward_std": 0.18475459516048431, | |
| "rewards/accuracy_reward": 0.7447913885116577, | |
| "rewards/format_reward": 1.0, | |
| "step": 330 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.375, | |
| "completion_length": 91.765625, | |
| "epoch": 0.6257088846880907, | |
| "grad_norm": 1.1358781168676717, | |
| "kl": 0.0478515625, | |
| "learning_rate": 6.15260013725555e-07, | |
| "loss": 0.0019, | |
| "reward": 1.4526742696762085, | |
| "reward_std": 0.11981412023305893, | |
| "rewards/accuracy_reward": 0.4526742696762085, | |
| "rewards/format_reward": 1.0, | |
| "step": 331 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 98.609375, | |
| "epoch": 0.6275992438563327, | |
| "grad_norm": 2.5447540141563048, | |
| "kl": 0.048828125, | |
| "learning_rate": 6.097852264933696e-07, | |
| "loss": 0.002, | |
| "reward": 1.619698166847229, | |
| "reward_std": 0.16045129299163818, | |
| "rewards/accuracy_reward": 0.6275107860565186, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 332 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.7265625, | |
| "epoch": 0.6294896030245747, | |
| "grad_norm": 1.5705944493635686, | |
| "kl": 0.0478515625, | |
| "learning_rate": 6.043242015552256e-07, | |
| "loss": 0.0019, | |
| "reward": 1.52734375, | |
| "reward_std": 0.21317726373672485, | |
| "rewards/accuracy_reward": 0.52734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 333 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.20703125, | |
| "epoch": 0.6313799621928167, | |
| "grad_norm": 1.4532227455775275, | |
| "kl": 0.039306640625, | |
| "learning_rate": 5.988771315133417e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5797886848449707, | |
| "reward_std": 0.17210961878299713, | |
| "rewards/accuracy_reward": 0.5797887444496155, | |
| "rewards/format_reward": 1.0, | |
| "step": 334 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.7421875, | |
| "epoch": 0.6332703213610587, | |
| "grad_norm": 1.5032464010247504, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.934442084777675e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6588280200958252, | |
| "reward_std": 0.17925553023815155, | |
| "rewards/accuracy_reward": 0.6588280200958252, | |
| "rewards/format_reward": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 89.48046875, | |
| "epoch": 0.6351606805293005, | |
| "grad_norm": 3.9562339582094346, | |
| "kl": 0.056640625, | |
| "learning_rate": 5.880256240596095e-07, | |
| "loss": 0.0023, | |
| "reward": 1.5489052534103394, | |
| "reward_std": 0.1494232714176178, | |
| "rewards/accuracy_reward": 0.5489052534103394, | |
| "rewards/format_reward": 1.0, | |
| "step": 336 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.6171875, | |
| "epoch": 0.6370510396975425, | |
| "grad_norm": 4.504748528156339, | |
| "kl": 0.0537109375, | |
| "learning_rate": 5.826215693642709e-07, | |
| "loss": 0.0021, | |
| "reward": 1.574186086654663, | |
| "reward_std": 0.14129537343978882, | |
| "rewards/accuracy_reward": 0.5780923366546631, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 337 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.25, | |
| "completion_length": 96.40234375, | |
| "epoch": 0.6389413988657845, | |
| "grad_norm": 0.9918255765496156, | |
| "kl": 0.046875, | |
| "learning_rate": 5.772322349847153e-07, | |
| "loss": 0.0019, | |
| "reward": 1.65234375, | |
| "reward_std": 0.09954919666051865, | |
| "rewards/accuracy_reward": 0.65234375, | |
| "rewards/format_reward": 1.0, | |
| "step": 338 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.94921875, | |
| "epoch": 0.6408317580340265, | |
| "grad_norm": 2.0285380237709387, | |
| "kl": 0.050537109375, | |
| "learning_rate": 5.718578109947409e-07, | |
| "loss": 0.002, | |
| "reward": 1.608215093612671, | |
| "reward_std": 0.10201030969619751, | |
| "rewards/accuracy_reward": 0.6082150340080261, | |
| "rewards/format_reward": 1.0, | |
| "step": 339 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 87.05078125, | |
| "epoch": 0.6427221172022685, | |
| "grad_norm": 3.3176011557910097, | |
| "kl": 0.053466796875, | |
| "learning_rate": 5.664984869422802e-07, | |
| "loss": 0.0021, | |
| "reward": 1.5531736612319946, | |
| "reward_std": 0.17685286700725555, | |
| "rewards/accuracy_reward": 0.5531736612319946, | |
| "rewards/format_reward": 1.0, | |
| "step": 340 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 100.0234375, | |
| "epoch": 0.6446124763705104, | |
| "grad_norm": 1.9481732665617166, | |
| "kl": 0.049072265625, | |
| "learning_rate": 5.611544518427121e-07, | |
| "loss": 0.002, | |
| "reward": 1.6415621042251587, | |
| "reward_std": 0.163002610206604, | |
| "rewards/accuracy_reward": 0.6493746042251587, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 341 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.18359375, | |
| "epoch": 0.6465028355387523, | |
| "grad_norm": 1.9147871985701481, | |
| "kl": 0.04931640625, | |
| "learning_rate": 5.558258941721981e-07, | |
| "loss": 0.002, | |
| "reward": 1.5601630210876465, | |
| "reward_std": 0.17341557145118713, | |
| "rewards/accuracy_reward": 0.5640692710876465, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 342 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 86.3515625, | |
| "epoch": 0.6483931947069943, | |
| "grad_norm": 2.2167075436448465, | |
| "kl": 0.05126953125, | |
| "learning_rate": 5.505130018610321e-07, | |
| "loss": 0.002, | |
| "reward": 1.6853443384170532, | |
| "reward_std": 0.20850321650505066, | |
| "rewards/accuracy_reward": 0.7009693384170532, | |
| "rewards/format_reward": 0.984375, | |
| "step": 343 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.1875, | |
| "completion_length": 83.95703125, | |
| "epoch": 0.6502835538752363, | |
| "grad_norm": 2.499264160160712, | |
| "kl": 0.055908203125, | |
| "learning_rate": 5.452159622870157e-07, | |
| "loss": 0.0022, | |
| "reward": 1.6708264350891113, | |
| "reward_std": 0.09797890484333038, | |
| "rewards/accuracy_reward": 0.6708264350891113, | |
| "rewards/format_reward": 1.0, | |
| "step": 344 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 98.9375, | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 1.1468871398817573, | |
| "kl": 0.051025390625, | |
| "learning_rate": 5.399349622688478e-07, | |
| "loss": 0.002, | |
| "reward": 1.5537773370742798, | |
| "reward_std": 0.17344093322753906, | |
| "rewards/accuracy_reward": 0.5772148370742798, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 345 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.84765625, | |
| "epoch": 0.6540642722117203, | |
| "grad_norm": 1.365593331621954, | |
| "kl": 0.0498046875, | |
| "learning_rate": 5.346701880595353e-07, | |
| "loss": 0.002, | |
| "reward": 1.6378886699676514, | |
| "reward_std": 0.14274653792381287, | |
| "rewards/accuracy_reward": 0.6378886699676514, | |
| "rewards/format_reward": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.25, | |
| "completion_length": 94.91015625, | |
| "epoch": 0.6559546313799622, | |
| "grad_norm": 1.5634515651271659, | |
| "kl": 0.045654296875, | |
| "learning_rate": 5.29421825339826e-07, | |
| "loss": 0.0018, | |
| "reward": 1.589550495147705, | |
| "reward_std": 0.13411790132522583, | |
| "rewards/accuracy_reward": 0.5973629951477051, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 347 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.25, | |
| "completion_length": 90.8359375, | |
| "epoch": 0.6578449905482041, | |
| "grad_norm": 1.1226897308498045, | |
| "kl": 0.0458984375, | |
| "learning_rate": 5.241900592116579e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5573174953460693, | |
| "reward_std": 0.09218928962945938, | |
| "rewards/accuracy_reward": 0.5573174953460693, | |
| "rewards/format_reward": 1.0, | |
| "step": 348 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.4453125, | |
| "epoch": 0.6597353497164461, | |
| "grad_norm": 1.4290088468656539, | |
| "kl": 0.051513671875, | |
| "learning_rate": 5.189750741916326e-07, | |
| "loss": 0.0021, | |
| "reward": 1.623161792755127, | |
| "reward_std": 0.20275253057479858, | |
| "rewards/accuracy_reward": 0.642693042755127, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 349 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.03125, | |
| "completion_length": 96.1015625, | |
| "epoch": 0.6616257088846881, | |
| "grad_norm": 1.8681119609391044, | |
| "kl": 0.048583984375, | |
| "learning_rate": 5.137770542045062e-07, | |
| "loss": 0.0019, | |
| "reward": 1.688063144683838, | |
| "reward_std": 0.23905277252197266, | |
| "rewards/accuracy_reward": 0.6997818946838379, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 350 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 84.9453125, | |
| "epoch": 0.6635160680529301, | |
| "grad_norm": 1.4728027178236136, | |
| "kl": 0.04833984375, | |
| "learning_rate": 5.085961825767049e-07, | |
| "loss": 0.0019, | |
| "reward": 1.674993872642517, | |
| "reward_std": 0.17204201221466064, | |
| "rewards/accuracy_reward": 0.6789001226425171, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 351 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 86.9140625, | |
| "epoch": 0.665406427221172, | |
| "grad_norm": 3.785173448605038, | |
| "kl": 0.04443359375, | |
| "learning_rate": 5.034326420298557e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5950738191604614, | |
| "reward_std": 0.18303368985652924, | |
| "rewards/accuracy_reward": 0.5950738191604614, | |
| "rewards/format_reward": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.375, | |
| "completion_length": 93.01953125, | |
| "epoch": 0.667296786389414, | |
| "grad_norm": 1.4357110637131671, | |
| "kl": 0.0458984375, | |
| "learning_rate": 4.982866146743464e-07, | |
| "loss": 0.0018, | |
| "reward": 1.3835440874099731, | |
| "reward_std": 0.16043886542320251, | |
| "rewards/accuracy_reward": 0.40307533740997314, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 353 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 91.83203125, | |
| "epoch": 0.6691871455576559, | |
| "grad_norm": 2.0314351942520203, | |
| "kl": 0.037841796875, | |
| "learning_rate": 4.93158282002899e-07, | |
| "loss": 0.0015, | |
| "reward": 1.691582441329956, | |
| "reward_std": 0.21125006675720215, | |
| "rewards/accuracy_reward": 0.6915825605392456, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.3125, | |
| "completion_length": 96.66015625, | |
| "epoch": 0.6710775047258979, | |
| "grad_norm": 1.360692718067674, | |
| "kl": 0.046630859375, | |
| "learning_rate": 4.880478248841706e-07, | |
| "loss": 0.0019, | |
| "reward": 1.4369994401931763, | |
| "reward_std": 0.17168085277080536, | |
| "rewards/accuracy_reward": 0.44090569019317627, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 355 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 85.65234375, | |
| "epoch": 0.6729678638941399, | |
| "grad_norm": 2.116858815599637, | |
| "kl": 0.047607421875, | |
| "learning_rate": 4.82955423556375e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5795400142669678, | |
| "reward_std": 0.18496635556221008, | |
| "rewards/accuracy_reward": 0.5795398950576782, | |
| "rewards/format_reward": 1.0, | |
| "step": 356 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 83.1875, | |
| "epoch": 0.6748582230623819, | |
| "grad_norm": 1.7162608457835138, | |
| "kl": 0.04833984375, | |
| "learning_rate": 4.778812576209241e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5707752704620361, | |
| "reward_std": 0.09887948632240295, | |
| "rewards/accuracy_reward": 0.5707752108573914, | |
| "rewards/format_reward": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.15625, | |
| "completion_length": 88.57421875, | |
| "epoch": 0.6767485822306238, | |
| "grad_norm": 2.366206137579118, | |
| "kl": 0.0439453125, | |
| "learning_rate": 4.728255060360955e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6372836828231812, | |
| "reward_std": 0.17435956001281738, | |
| "rewards/accuracy_reward": 0.6372836828231812, | |
| "rewards/format_reward": 1.0, | |
| "step": 358 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 86.61328125, | |
| "epoch": 0.6786389413988658, | |
| "grad_norm": 1.692332788365531, | |
| "kl": 0.0478515625, | |
| "learning_rate": 4.6778834711071924e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5796103477478027, | |
| "reward_std": 0.15928372740745544, | |
| "rewards/accuracy_reward": 0.5796103477478027, | |
| "rewards/format_reward": 1.0, | |
| "step": 359 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.30859375, | |
| "epoch": 0.6805293005671077, | |
| "grad_norm": 1.2973562747312646, | |
| "kl": 0.05029296875, | |
| "learning_rate": 4.627699584978911e-07, | |
| "loss": 0.002, | |
| "reward": 1.6404190063476562, | |
| "reward_std": 0.14580589532852173, | |
| "rewards/accuracy_reward": 0.640419065952301, | |
| "rewards/format_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.734375, | |
| "epoch": 0.6824196597353497, | |
| "grad_norm": 1.924682020333693, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.57770517188705e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5807889699935913, | |
| "reward_std": 0.18111172318458557, | |
| "rewards/accuracy_reward": 0.5846952199935913, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 361 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.046875, | |
| "epoch": 0.6843100189035917, | |
| "grad_norm": 1.4869820010117951, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.527901995060113e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6711153984069824, | |
| "reward_std": 0.16736207902431488, | |
| "rewards/accuracy_reward": 0.6789278984069824, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 362 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.83984375, | |
| "epoch": 0.6862003780718336, | |
| "grad_norm": 1.7329104320683526, | |
| "kl": 0.04150390625, | |
| "learning_rate": 4.4782918109819976e-07, | |
| "loss": 0.0017, | |
| "reward": 1.4670283794403076, | |
| "reward_std": 0.24669389426708221, | |
| "rewards/accuracy_reward": 0.4670283794403076, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.25, | |
| "completion_length": 93.6328125, | |
| "epoch": 0.6880907372400756, | |
| "grad_norm": 1.5230757953398353, | |
| "kl": 0.052490234375, | |
| "learning_rate": 4.4288763693300226e-07, | |
| "loss": 0.0021, | |
| "reward": 1.6058498620986938, | |
| "reward_std": 0.14699707925319672, | |
| "rewards/accuracy_reward": 0.6175686120986938, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 364 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 96.92578125, | |
| "epoch": 0.6899810964083176, | |
| "grad_norm": 4.290738008536141, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.3796574129132423e-07, | |
| "loss": 0.0018, | |
| "reward": 1.560653805732727, | |
| "reward_std": 0.18818287551403046, | |
| "rewards/accuracy_reward": 0.580185055732727, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 365 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.5859375, | |
| "epoch": 0.6918714555765595, | |
| "grad_norm": 2.5809981268174518, | |
| "kl": 0.0478515625, | |
| "learning_rate": 4.3306366776109616e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5660797357559204, | |
| "reward_std": 0.1301419585943222, | |
| "rewards/accuracy_reward": 0.5660797357559204, | |
| "rewards/format_reward": 1.0, | |
| "step": 366 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.61328125, | |
| "epoch": 0.6937618147448015, | |
| "grad_norm": 2.395698693198084, | |
| "kl": 0.047119140625, | |
| "learning_rate": 4.2818158923115244e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6860473155975342, | |
| "reward_std": 0.17321570217609406, | |
| "rewards/accuracy_reward": 0.6860473155975342, | |
| "rewards/format_reward": 1.0, | |
| "step": 367 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.4375, | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 2.175508616602608, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.233196778851329e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5961754322052002, | |
| "reward_std": 0.15679945051670074, | |
| "rewards/accuracy_reward": 0.5961754322052002, | |
| "rewards/format_reward": 1.0, | |
| "step": 368 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 100.703125, | |
| "epoch": 0.6975425330812854, | |
| "grad_norm": 2.3623237744377685, | |
| "kl": 0.043212890625, | |
| "learning_rate": 4.184781051954125e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6343427896499634, | |
| "reward_std": 0.1671934723854065, | |
| "rewards/accuracy_reward": 0.6343427896499634, | |
| "rewards/format_reward": 1.0, | |
| "step": 369 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 97.34765625, | |
| "epoch": 0.6994328922495274, | |
| "grad_norm": 1.8342117768253567, | |
| "kl": 0.0419921875, | |
| "learning_rate": 4.136570419170501e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5979249477386475, | |
| "reward_std": 0.2753554880619049, | |
| "rewards/accuracy_reward": 0.6135499477386475, | |
| "rewards/format_reward": 0.984375, | |
| "step": 370 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 96.44140625, | |
| "epoch": 0.7013232514177694, | |
| "grad_norm": 1.8499863828085692, | |
| "kl": 0.04345703125, | |
| "learning_rate": 4.088566580817694e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5595123767852783, | |
| "reward_std": 0.18973296880722046, | |
| "rewards/accuracy_reward": 0.5634186267852783, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 371 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.125, | |
| "completion_length": 99.48828125, | |
| "epoch": 0.7032136105860114, | |
| "grad_norm": 2.208899799066235, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.040771229919612e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6975526809692383, | |
| "reward_std": 0.17384248971939087, | |
| "rewards/accuracy_reward": 0.7053651809692383, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 372 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 87.18359375, | |
| "epoch": 0.7051039697542533, | |
| "grad_norm": 1.8987142352577209, | |
| "kl": 0.04638671875, | |
| "learning_rate": 3.9931860521471097e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5160590410232544, | |
| "reward_std": 0.21702983975410461, | |
| "rewards/accuracy_reward": 0.5160590410232544, | |
| "rewards/format_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 92.609375, | |
| "epoch": 0.7069943289224953, | |
| "grad_norm": 15.04217885199878, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.945812725758554e-07, | |
| "loss": 0.0018, | |
| "reward": 1.7074790000915527, | |
| "reward_std": 0.19710449874401093, | |
| "rewards/accuracy_reward": 0.7113852500915527, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 374 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 92.65625, | |
| "epoch": 0.7088846880907372, | |
| "grad_norm": 1.91152067931555, | |
| "kl": 0.043701171875, | |
| "learning_rate": 3.898652921540627e-07, | |
| "loss": 0.0017, | |
| "reward": 1.549987554550171, | |
| "reward_std": 0.15819934010505676, | |
| "rewards/accuracy_reward": 0.5617063641548157, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 375 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.28125, | |
| "completion_length": 89.109375, | |
| "epoch": 0.7107750472589792, | |
| "grad_norm": 1.585271716791016, | |
| "kl": 0.052001953125, | |
| "learning_rate": 3.851708302749409e-07, | |
| "loss": 0.0021, | |
| "reward": 1.4718488454818726, | |
| "reward_std": 0.15529434382915497, | |
| "rewards/accuracy_reward": 0.47184884548187256, | |
| "rewards/format_reward": 1.0, | |
| "step": 376 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.43359375, | |
| "epoch": 0.7126654064272212, | |
| "grad_norm": 1.6292096929321715, | |
| "kl": 0.039794921875, | |
| "learning_rate": 3.8049805250517e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5054469108581543, | |
| "reward_std": 0.1916845291852951, | |
| "rewards/accuracy_reward": 0.5054467916488647, | |
| "rewards/format_reward": 1.0, | |
| "step": 377 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 87.43359375, | |
| "epoch": 0.7145557655954632, | |
| "grad_norm": 2.5257432847150865, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.7584712364666493e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5939102172851562, | |
| "reward_std": 0.24795284867286682, | |
| "rewards/accuracy_reward": 0.5939102172851562, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.90625, | |
| "epoch": 0.7164461247637051, | |
| "grad_norm": 2.7673062583960317, | |
| "kl": 0.045166015625, | |
| "learning_rate": 3.7121820773076097e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5857834815979004, | |
| "reward_std": 0.25405406951904297, | |
| "rewards/accuracy_reward": 0.5857834219932556, | |
| "rewards/format_reward": 1.0, | |
| "step": 379 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.09375, | |
| "completion_length": 90.87109375, | |
| "epoch": 0.718336483931947, | |
| "grad_norm": 2.2822899165231294, | |
| "kl": 0.047119140625, | |
| "learning_rate": 3.666114680124298e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5186080932617188, | |
| "reward_std": 0.1982262283563614, | |
| "rewards/accuracy_reward": 0.5186082124710083, | |
| "rewards/format_reward": 1.0, | |
| "step": 380 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 87.87890625, | |
| "epoch": 0.720226843100189, | |
| "grad_norm": 2.6361171968305346, | |
| "kl": 0.047607421875, | |
| "learning_rate": 3.620270669645228e-07, | |
| "loss": 0.0019, | |
| "reward": 1.43359375, | |
| "reward_std": 0.2574925422668457, | |
| "rewards/accuracy_reward": 0.43359375, | |
| "rewards/format_reward": 1.0, | |
| "step": 381 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.53515625, | |
| "epoch": 0.722117202268431, | |
| "grad_norm": 1.934475962789802, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.5746516627203816e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5397059917449951, | |
| "reward_std": 0.2019021213054657, | |
| "rewards/accuracy_reward": 0.5397060513496399, | |
| "rewards/format_reward": 1.0, | |
| "step": 382 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.05859375, | |
| "epoch": 0.724007561436673, | |
| "grad_norm": 1.7765413930033112, | |
| "kl": 0.04248046875, | |
| "learning_rate": 3.529259268264213e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6433665752410889, | |
| "reward_std": 0.1822570264339447, | |
| "rewards/accuracy_reward": 0.6433665752410889, | |
| "rewards/format_reward": 1.0, | |
| "step": 383 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.70703125, | |
| "epoch": 0.725897920604915, | |
| "grad_norm": 3.235569329484398, | |
| "kl": 0.04296875, | |
| "learning_rate": 3.4840950871988806e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6414120197296143, | |
| "reward_std": 0.1383558064699173, | |
| "rewards/accuracy_reward": 0.6414120197296143, | |
| "rewards/format_reward": 1.0, | |
| "step": 384 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 84.59765625, | |
| "epoch": 0.7277882797731569, | |
| "grad_norm": 2.5415509345709726, | |
| "kl": 0.052978515625, | |
| "learning_rate": 3.4391607123978096e-07, | |
| "loss": 0.0021, | |
| "reward": 1.6038849353790283, | |
| "reward_std": 0.14968551695346832, | |
| "rewards/accuracy_reward": 0.6038850545883179, | |
| "rewards/format_reward": 1.0, | |
| "step": 385 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.71484375, | |
| "epoch": 0.7296786389413988, | |
| "grad_norm": 1.9594609018418163, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.3944577286294886e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5191731452941895, | |
| "reward_std": 0.21669438481330872, | |
| "rewards/accuracy_reward": 0.5308918952941895, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 386 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.5, | |
| "epoch": 0.7315689981096408, | |
| "grad_norm": 1.5282132767893477, | |
| "kl": 0.044189453125, | |
| "learning_rate": 3.3499877125015907e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5483942031860352, | |
| "reward_std": 0.1809859573841095, | |
| "rewards/accuracy_reward": 0.5523004531860352, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 387 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 88.44921875, | |
| "epoch": 0.7334593572778828, | |
| "grad_norm": 1.511915251166267, | |
| "kl": 0.04443359375, | |
| "learning_rate": 3.305752232405377e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5602011680603027, | |
| "reward_std": 0.1667328178882599, | |
| "rewards/accuracy_reward": 0.560201108455658, | |
| "rewards/format_reward": 1.0, | |
| "step": 388 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.125, | |
| "completion_length": 87.83203125, | |
| "epoch": 0.7353497164461248, | |
| "grad_norm": 2.450602394222447, | |
| "kl": 0.0419921875, | |
| "learning_rate": 3.2617528484603574e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6159805059432983, | |
| "reward_std": 0.22847887873649597, | |
| "rewards/accuracy_reward": 0.6159805059432983, | |
| "rewards/format_reward": 1.0, | |
| "step": 389 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 93.2890625, | |
| "epoch": 0.7372400756143668, | |
| "grad_norm": 1.5610935075912176, | |
| "kl": 0.043212890625, | |
| "learning_rate": 3.217991112459296e-07, | |
| "loss": 0.0017, | |
| "reward": 1.614638328552246, | |
| "reward_std": 0.19237719476222992, | |
| "rewards/accuracy_reward": 0.6146383285522461, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 89.69921875, | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 1.6750006945933873, | |
| "kl": 0.041259765625, | |
| "learning_rate": 3.174468567813461e-07, | |
| "loss": 0.0017, | |
| "reward": 1.703397274017334, | |
| "reward_std": 0.19982343912124634, | |
| "rewards/accuracy_reward": 0.703397274017334, | |
| "rewards/format_reward": 1.0, | |
| "step": 391 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.98046875, | |
| "epoch": 0.7410207939508506, | |
| "grad_norm": 1.5050532217987163, | |
| "kl": 0.040771484375, | |
| "learning_rate": 3.131186749498195e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6249730587005615, | |
| "reward_std": 0.1348705291748047, | |
| "rewards/accuracy_reward": 0.6249730587005615, | |
| "rewards/format_reward": 1.0, | |
| "step": 392 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 86.80078125, | |
| "epoch": 0.7429111531190926, | |
| "grad_norm": 2.2056728949358497, | |
| "kl": 0.047607421875, | |
| "learning_rate": 3.0881471839987815e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6306958198547363, | |
| "reward_std": 0.15101388096809387, | |
| "rewards/accuracy_reward": 0.6306958198547363, | |
| "rewards/format_reward": 1.0, | |
| "step": 393 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.85546875, | |
| "epoch": 0.7448015122873346, | |
| "grad_norm": 2.1315080678559024, | |
| "kl": 0.044921875, | |
| "learning_rate": 3.0453513892566195e-07, | |
| "loss": 0.0018, | |
| "reward": 1.544640302658081, | |
| "reward_std": 0.19710536301136017, | |
| "rewards/accuracy_reward": 0.5446402430534363, | |
| "rewards/format_reward": 1.0, | |
| "step": 394 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 85.7109375, | |
| "epoch": 0.7466918714555766, | |
| "grad_norm": 2.06732415423423, | |
| "kl": 0.04931640625, | |
| "learning_rate": 3.0028008746156587e-07, | |
| "loss": 0.002, | |
| "reward": 1.5586893558502197, | |
| "reward_std": 0.15845312178134918, | |
| "rewards/accuracy_reward": 0.5586893558502197, | |
| "rewards/format_reward": 1.0, | |
| "step": 395 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.09375, | |
| "completion_length": 93.92578125, | |
| "epoch": 0.7485822306238186, | |
| "grad_norm": 1.8815000905698593, | |
| "kl": 0.046630859375, | |
| "learning_rate": 2.9604971407692026e-07, | |
| "loss": 0.0019, | |
| "reward": 1.697596788406372, | |
| "reward_std": 0.13653597235679626, | |
| "rewards/accuracy_reward": 0.6975967288017273, | |
| "rewards/format_reward": 1.0, | |
| "step": 396 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.375, | |
| "completion_length": 93.3359375, | |
| "epoch": 0.7504725897920604, | |
| "grad_norm": 1.5391602816018224, | |
| "kl": 0.041015625, | |
| "learning_rate": 2.918441679706949e-07, | |
| "loss": 0.0016, | |
| "reward": 1.386269211769104, | |
| "reward_std": 0.18272624909877777, | |
| "rewards/accuracy_reward": 0.394081711769104, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 397 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 90.81640625, | |
| "epoch": 0.7523629489603024, | |
| "grad_norm": 1.1599213261291745, | |
| "kl": 0.043701171875, | |
| "learning_rate": 2.876635974662389e-07, | |
| "loss": 0.0017, | |
| "reward": 1.473933458328247, | |
| "reward_std": 0.16899192333221436, | |
| "rewards/accuracy_reward": 0.47783973813056946, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 398 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.78515625, | |
| "epoch": 0.7542533081285444, | |
| "grad_norm": 1.8839648182094828, | |
| "kl": 0.041748046875, | |
| "learning_rate": 2.8350815000604976e-07, | |
| "loss": 0.0017, | |
| "reward": 1.55859375, | |
| "reward_std": 0.23474985361099243, | |
| "rewards/accuracy_reward": 0.5703125, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 399 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 93.2578125, | |
| "epoch": 0.7561436672967864, | |
| "grad_norm": 1.4293529083675727, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.7937797214657143e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5872396230697632, | |
| "reward_std": 0.1798129379749298, | |
| "rewards/accuracy_reward": 0.6106771230697632, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 400 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 86.91796875, | |
| "epoch": 0.7580340264650284, | |
| "grad_norm": 2.026608250956968, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.752732095530279e-07, | |
| "loss": 0.0018, | |
| "reward": 1.4875531196594238, | |
| "reward_std": 0.18699489533901215, | |
| "rewards/accuracy_reward": 0.48755308985710144, | |
| "rewards/format_reward": 1.0, | |
| "step": 401 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 99.1640625, | |
| "epoch": 0.7599243856332704, | |
| "grad_norm": 1.5439354460633543, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.711940069942833e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6947214603424072, | |
| "reward_std": 0.1390814185142517, | |
| "rewards/accuracy_reward": 0.7181590795516968, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 402 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 96.125, | |
| "epoch": 0.7618147448015122, | |
| "grad_norm": 1.9024078180465291, | |
| "kl": 0.039794921875, | |
| "learning_rate": 2.671405083377386e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6169142723083496, | |
| "reward_std": 0.19913235306739807, | |
| "rewards/accuracy_reward": 0.6169142723083496, | |
| "rewards/format_reward": 1.0, | |
| "step": 403 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 94.4921875, | |
| "epoch": 0.7637051039697542, | |
| "grad_norm": 2.8962041644930108, | |
| "kl": 0.042724609375, | |
| "learning_rate": 2.6311285654425574e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6525933742523193, | |
| "reward_std": 0.25046759843826294, | |
| "rewards/accuracy_reward": 0.6525933146476746, | |
| "rewards/format_reward": 1.0, | |
| "step": 404 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 87.58203125, | |
| "epoch": 0.7655954631379962, | |
| "grad_norm": 1.4168894497370028, | |
| "kl": 0.0419921875, | |
| "learning_rate": 2.59111193663116e-07, | |
| "loss": 0.0017, | |
| "reward": 1.505859375, | |
| "reward_std": 0.17382082343101501, | |
| "rewards/accuracy_reward": 0.505859375, | |
| "rewards/format_reward": 1.0, | |
| "step": 405 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.953125, | |
| "epoch": 0.7674858223062382, | |
| "grad_norm": 2.400477941677905, | |
| "kl": 0.044677734375, | |
| "learning_rate": 2.5513566082701134e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6569660902023315, | |
| "reward_std": 0.14563217759132385, | |
| "rewards/accuracy_reward": 0.6569661498069763, | |
| "rewards/format_reward": 1.0, | |
| "step": 406 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.703125, | |
| "epoch": 0.7693761814744802, | |
| "grad_norm": 1.2607140346650068, | |
| "kl": 0.03515625, | |
| "learning_rate": 2.51186398247065e-07, | |
| "loss": 0.0014, | |
| "reward": 1.6192121505737305, | |
| "reward_std": 0.11734330654144287, | |
| "rewards/accuracy_reward": 0.6192121505737305, | |
| "rewards/format_reward": 1.0, | |
| "step": 407 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.6015625, | |
| "epoch": 0.7712665406427222, | |
| "grad_norm": 3.7794955364804887, | |
| "kl": 0.044677734375, | |
| "learning_rate": 2.472635452078883e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5780255794525146, | |
| "reward_std": 0.1755901575088501, | |
| "rewards/accuracy_reward": 0.5780255794525146, | |
| "rewards/format_reward": 1.0, | |
| "step": 408 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.25, | |
| "completion_length": 89.140625, | |
| "epoch": 0.7731568998109641, | |
| "grad_norm": 1.6217685852043846, | |
| "kl": 0.0400390625, | |
| "learning_rate": 2.433672400626663e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6167256832122803, | |
| "reward_std": 0.1386527717113495, | |
| "rewards/accuracy_reward": 0.616725742816925, | |
| "rewards/format_reward": 1.0, | |
| "step": 409 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 88.8203125, | |
| "epoch": 0.775047258979206, | |
| "grad_norm": 2.028364703410531, | |
| "kl": 0.051025390625, | |
| "learning_rate": 2.3949762022828093e-07, | |
| "loss": 0.002, | |
| "reward": 1.5439236164093018, | |
| "reward_std": 0.22322307527065277, | |
| "rewards/accuracy_reward": 0.5439236164093018, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.33203125, | |
| "epoch": 0.776937618147448, | |
| "grad_norm": 2.2480672974329035, | |
| "kl": 0.04931640625, | |
| "learning_rate": 2.3565482218046073e-07, | |
| "loss": 0.002, | |
| "reward": 1.567735195159912, | |
| "reward_std": 0.2084602564573288, | |
| "rewards/accuracy_reward": 0.5677351355552673, | |
| "rewards/format_reward": 1.0, | |
| "step": 411 | |
| }, | |
| { | |
| "all_correct": 0.5625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 94.765625, | |
| "epoch": 0.77882797731569, | |
| "grad_norm": 1.8397495365435537, | |
| "kl": 0.04248046875, | |
| "learning_rate": 2.3183898144897175e-07, | |
| "loss": 0.0017, | |
| "reward": 1.7432655096054077, | |
| "reward_std": 0.177871972322464, | |
| "rewards/accuracy_reward": 0.7432655096054077, | |
| "rewards/format_reward": 1.0, | |
| "step": 412 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.25, | |
| "completion_length": 90.35546875, | |
| "epoch": 0.780718336483932, | |
| "grad_norm": 2.4627560536575497, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.2805023261283496e-07, | |
| "loss": 0.0018, | |
| "reward": 1.4680068492889404, | |
| "reward_std": 0.2125786542892456, | |
| "rewards/accuracy_reward": 0.46800681948661804, | |
| "rewards/format_reward": 1.0, | |
| "step": 413 | |
| }, | |
| { | |
| "all_correct": 0.15625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 96.5859375, | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 21.851694077321632, | |
| "kl": 0.042724609375, | |
| "learning_rate": 2.2428870929558007e-07, | |
| "loss": 0.0017, | |
| "reward": 1.4453372955322266, | |
| "reward_std": 0.26194411516189575, | |
| "rewards/accuracy_reward": 0.4453372359275818, | |
| "rewards/format_reward": 1.0, | |
| "step": 414 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.3125, | |
| "completion_length": 95.6015625, | |
| "epoch": 0.7844990548204159, | |
| "grad_norm": 2.1610667049451417, | |
| "kl": 0.04833984375, | |
| "learning_rate": 2.205545441605342e-07, | |
| "loss": 0.0019, | |
| "reward": 1.4564586877822876, | |
| "reward_std": 0.1753809005022049, | |
| "rewards/accuracy_reward": 0.46427121758461, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 415 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.46875, | |
| "epoch": 0.7863894139886578, | |
| "grad_norm": 10.481690705107473, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.1684786890614127e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5876367092132568, | |
| "reward_std": 0.17977751791477203, | |
| "rewards/accuracy_reward": 0.5876367092132568, | |
| "rewards/format_reward": 1.0, | |
| "step": 416 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.125, | |
| "completion_length": 94.65625, | |
| "epoch": 0.7882797731568998, | |
| "grad_norm": 2.8247468950858647, | |
| "kl": 0.039306640625, | |
| "learning_rate": 2.1316881426131827e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6735260486602783, | |
| "reward_std": 0.17629718780517578, | |
| "rewards/accuracy_reward": 0.6735259890556335, | |
| "rewards/format_reward": 1.0, | |
| "step": 417 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.125, | |
| "completion_length": 95.9765625, | |
| "epoch": 0.7901701323251418, | |
| "grad_norm": 1.9238663712431303, | |
| "kl": 0.041259765625, | |
| "learning_rate": 2.0951750998084438e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5885775089263916, | |
| "reward_std": 0.2954632043838501, | |
| "rewards/accuracy_reward": 0.6002963781356812, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 418 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 92.58203125, | |
| "epoch": 0.7920604914933838, | |
| "grad_norm": 2.082634838675884, | |
| "kl": 0.04296875, | |
| "learning_rate": 2.058940848407854e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5185022354125977, | |
| "reward_std": 0.21989545226097107, | |
| "rewards/accuracy_reward": 0.5185022950172424, | |
| "rewards/format_reward": 1.0, | |
| "step": 419 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.96484375, | |
| "epoch": 0.7939508506616257, | |
| "grad_norm": 1.4637336629082067, | |
| "kl": 0.038818359375, | |
| "learning_rate": 2.0229866663395023e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5763494968414307, | |
| "reward_std": 0.22779253125190735, | |
| "rewards/accuracy_reward": 0.5919744372367859, | |
| "rewards/format_reward": 0.984375, | |
| "step": 420 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.42578125, | |
| "epoch": 0.7958412098298677, | |
| "grad_norm": 1.3544480164552257, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1.9873138216538609e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6315104961395264, | |
| "reward_std": 0.15740279853343964, | |
| "rewards/accuracy_reward": 0.6354166865348816, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 421 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.28125, | |
| "completion_length": 91.765625, | |
| "epoch": 0.7977315689981096, | |
| "grad_norm": 9.201439231037005, | |
| "kl": 0.042724609375, | |
| "learning_rate": 1.951923572479044e-07, | |
| "loss": 0.0017, | |
| "reward": 1.479612112045288, | |
| "reward_std": 0.19981667399406433, | |
| "rewards/accuracy_reward": 0.4796121120452881, | |
| "rewards/format_reward": 1.0, | |
| "step": 422 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.84765625, | |
| "epoch": 0.7996219281663516, | |
| "grad_norm": 1.4237426067896413, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.916817166976441e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5277478694915771, | |
| "reward_std": 0.2083957940340042, | |
| "rewards/accuracy_reward": 0.5355602502822876, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 423 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.10546875, | |
| "epoch": 0.8015122873345936, | |
| "grad_norm": 2.293182503990981, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1.8819958432967076e-07, | |
| "loss": 0.0017, | |
| "reward": 1.558853268623352, | |
| "reward_std": 0.18910501897335052, | |
| "rewards/accuracy_reward": 0.562759518623352, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 424 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 95.41015625, | |
| "epoch": 0.8034026465028355, | |
| "grad_norm": 1.5366707245335558, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1.847460829536075e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6059564352035522, | |
| "reward_std": 0.2242327332496643, | |
| "rewards/accuracy_reward": 0.6215814352035522, | |
| "rewards/format_reward": 0.984375, | |
| "step": 425 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.1484375, | |
| "epoch": 0.8052930056710775, | |
| "grad_norm": 1.3312039879338635, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1.813213343693064e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5957437753677368, | |
| "reward_std": 0.16802389919757843, | |
| "rewards/accuracy_reward": 0.595743715763092, | |
| "rewards/format_reward": 1.0, | |
| "step": 426 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 97.79296875, | |
| "epoch": 0.8071833648393195, | |
| "grad_norm": 2.9517888821154714, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1.779254593625501e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6021231412887573, | |
| "reward_std": 0.22704648971557617, | |
| "rewards/accuracy_reward": 0.6021231412887573, | |
| "rewards/format_reward": 1.0, | |
| "step": 427 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.25, | |
| "completion_length": 92.49609375, | |
| "epoch": 0.8090737240075614, | |
| "grad_norm": 1.741276036032744, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.745585777007943e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5888817310333252, | |
| "reward_std": 0.09943927079439163, | |
| "rewards/accuracy_reward": 0.5888816714286804, | |
| "rewards/format_reward": 1.0, | |
| "step": 428 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 95.453125, | |
| "epoch": 0.8109640831758034, | |
| "grad_norm": 1.5867733017615009, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1.7122080812894146e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6287798881530762, | |
| "reward_std": 0.19112670421600342, | |
| "rewards/accuracy_reward": 0.6287798881530762, | |
| "rewards/format_reward": 1.0, | |
| "step": 429 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 97.41796875, | |
| "epoch": 0.8128544423440454, | |
| "grad_norm": 1.8869110513185203, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.679122683651546e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5692415237426758, | |
| "reward_std": 0.24745473265647888, | |
| "rewards/accuracy_reward": 0.569241464138031, | |
| "rewards/format_reward": 1.0, | |
| "step": 430 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 96.97265625, | |
| "epoch": 0.8147448015122873, | |
| "grad_norm": 2.0763617955307274, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1.6463307509670522e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5514062643051147, | |
| "reward_std": 0.21106550097465515, | |
| "rewards/accuracy_reward": 0.5514062643051147, | |
| "rewards/format_reward": 1.0, | |
| "step": 431 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 96.08984375, | |
| "epoch": 0.8166351606805293, | |
| "grad_norm": 4.650342860685201, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1.6138334397585674e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5383667945861816, | |
| "reward_std": 0.166859969496727, | |
| "rewards/accuracy_reward": 0.5500854849815369, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 432 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.25, | |
| "completion_length": 90.95703125, | |
| "epoch": 0.8185255198487713, | |
| "grad_norm": 1.7529564775308286, | |
| "kl": 0.04150390625, | |
| "learning_rate": 1.5816318961578756e-07, | |
| "loss": 0.0017, | |
| "reward": 1.4878764152526855, | |
| "reward_std": 0.18354278802871704, | |
| "rewards/accuracy_reward": 0.48787635564804077, | |
| "rewards/format_reward": 1.0, | |
| "step": 433 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 85.59375, | |
| "epoch": 0.8204158790170132, | |
| "grad_norm": 3.6359061398641677, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.5497272558654695e-07, | |
| "loss": 0.0019, | |
| "reward": 1.4656922817230225, | |
| "reward_std": 0.2116546779870987, | |
| "rewards/accuracy_reward": 0.46959853172302246, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 434 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 95.8515625, | |
| "epoch": 0.8223062381852552, | |
| "grad_norm": 4.974875276682775, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.5181206441105077e-07, | |
| "loss": 0.0018, | |
| "reward": 1.6275532245635986, | |
| "reward_std": 0.20301690697669983, | |
| "rewards/accuracy_reward": 0.6314594745635986, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 435 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.0625, | |
| "epoch": 0.8241965973534972, | |
| "grad_norm": 2.449754366709296, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.4868131756111223e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5798760652542114, | |
| "reward_std": 0.2174547016620636, | |
| "rewards/accuracy_reward": 0.5994073152542114, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 436 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 99.25390625, | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 1.4549018063192884, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.4558059545351142e-07, | |
| "loss": 0.0019, | |
| "reward": 1.5477688312530518, | |
| "reward_std": 0.2062843143939972, | |
| "rewards/accuracy_reward": 0.5673000812530518, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 437 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.74609375, | |
| "epoch": 0.8279773156899811, | |
| "grad_norm": 1.2525640014161112, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1.425100074461003e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6436383724212646, | |
| "reward_std": 0.1697710007429123, | |
| "rewards/accuracy_reward": 0.6436384320259094, | |
| "rewards/format_reward": 1.0, | |
| "step": 438 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 93.125, | |
| "epoch": 0.8298676748582231, | |
| "grad_norm": 2.3121952823917042, | |
| "kl": 0.040283203125, | |
| "learning_rate": 1.394696618339456e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6626487970352173, | |
| "reward_std": 0.174302339553833, | |
| "rewards/accuracy_reward": 0.6626487970352173, | |
| "rewards/format_reward": 1.0, | |
| "step": 439 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.34375, | |
| "completion_length": 93.94140625, | |
| "epoch": 0.831758034026465, | |
| "grad_norm": 1.3605971037488336, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1.364596658455105e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5049912929534912, | |
| "reward_std": 0.12267406284809113, | |
| "rewards/accuracy_reward": 0.504991352558136, | |
| "rewards/format_reward": 1.0, | |
| "step": 440 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.05859375, | |
| "epoch": 0.833648393194707, | |
| "grad_norm": 1.4329708155173375, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1.33480125638871e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6171175241470337, | |
| "reward_std": 0.1537449210882187, | |
| "rewards/accuracy_reward": 0.6210237741470337, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 441 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.9765625, | |
| "epoch": 0.8355387523629489, | |
| "grad_norm": 17.688886780647458, | |
| "kl": 0.0400390625, | |
| "learning_rate": 1.3053114629797435e-07, | |
| "loss": 0.0016, | |
| "reward": 1.5753612518310547, | |
| "reward_std": 0.18671679496765137, | |
| "rewards/accuracy_reward": 0.5753612518310547, | |
| "rewards/format_reward": 1.0, | |
| "step": 442 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 98.7578125, | |
| "epoch": 0.8374291115311909, | |
| "grad_norm": 1.9625832093816682, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.2761283182893047e-07, | |
| "loss": 0.0018, | |
| "reward": 1.5934289693832397, | |
| "reward_std": 0.21815939247608185, | |
| "rewards/accuracy_reward": 0.5934289693832397, | |
| "rewards/format_reward": 1.0, | |
| "step": 443 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 96.1328125, | |
| "epoch": 0.8393194706994329, | |
| "grad_norm": 1.580613447705244, | |
| "kl": 0.039794921875, | |
| "learning_rate": 1.2472528515634585e-07, | |
| "loss": 0.0016, | |
| "reward": 1.6483333110809326, | |
| "reward_std": 0.19740188121795654, | |
| "rewards/accuracy_reward": 0.6483333110809326, | |
| "rewards/format_reward": 1.0, | |
| "step": 444 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.78125, | |
| "epoch": 0.8412098298676749, | |
| "grad_norm": 1.1020475480569663, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1.2186860811969168e-07, | |
| "loss": 0.0016, | |
| "reward": 1.702857494354248, | |
| "reward_std": 0.13044710457324982, | |
| "rewards/accuracy_reward": 0.7028576135635376, | |
| "rewards/format_reward": 1.0, | |
| "step": 445 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.125, | |
| "completion_length": 94.3515625, | |
| "epoch": 0.8431001890359168, | |
| "grad_norm": 3.1193628774228603, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.1904290146971397e-07, | |
| "loss": 0.0019, | |
| "reward": 1.6091580390930176, | |
| "reward_std": 0.2114795446395874, | |
| "rewards/accuracy_reward": 0.6130642294883728, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 446 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.15625, | |
| "completion_length": 93.82421875, | |
| "epoch": 0.8449905482041588, | |
| "grad_norm": 1.5499881081679403, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.1624826486487872e-07, | |
| "loss": 0.0018, | |
| "reward": 1.68212890625, | |
| "reward_std": 0.1655098795890808, | |
| "rewards/accuracy_reward": 0.68994140625, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 447 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 96.47265625, | |
| "epoch": 0.8468809073724007, | |
| "grad_norm": 1.3637912127369223, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1.134847968678575e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6614583730697632, | |
| "reward_std": 0.15649890899658203, | |
| "rewards/accuracy_reward": 0.6809896230697632, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 448 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.21875, | |
| "completion_length": 90.0703125, | |
| "epoch": 0.8487712665406427, | |
| "grad_norm": 1.1591726049731972, | |
| "kl": 0.0390625, | |
| "learning_rate": 1.1075259494205225e-07, | |
| "loss": 0.0016, | |
| "reward": 1.655552625656128, | |
| "reward_std": 0.12017819285392761, | |
| "rewards/accuracy_reward": 0.6594588756561279, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 449 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 92.24609375, | |
| "epoch": 0.8506616257088847, | |
| "grad_norm": 2.039439334103359, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1.0805175544815648e-07, | |
| "loss": 0.0017, | |
| "reward": 1.6180661916732788, | |
| "reward_std": 0.16808518767356873, | |
| "rewards/accuracy_reward": 0.6180662512779236, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.09375, | |
| "completion_length": 87.765625, | |
| "epoch": 0.8525519848771267, | |
| "grad_norm": 3.551379163758167, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.0538237364075786e-07, | |
| "loss": 0.0019, | |
| "reward": 1.7047264575958252, | |
| "reward_std": 0.17995205521583557, | |
| "rewards/accuracy_reward": 0.7047264575958252, | |
| "rewards/format_reward": 1.0, | |
| "step": 451 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 94.16796875, | |
| "epoch": 0.8544423440453687, | |
| "grad_norm": 2.4552903396429553, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.0274454366497787e-07, | |
| "loss": 0.0017, | |
| "reward": 1.5992646217346191, | |
| "reward_std": 0.19959133863449097, | |
| "rewards/accuracy_reward": 0.5992645621299744, | |
| "rewards/format_reward": 1.0, | |
| "step": 452 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 98.84375, | |
| "epoch": 0.8563327032136105, | |
| "grad_norm": 1.9864652083005676, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.0013835855315233e-07, | |
| "loss": 0.0017, | |
| "reward": 1.547175407409668, | |
| "reward_std": 0.18141832947731018, | |
| "rewards/accuracy_reward": 0.5471754670143127, | |
| "rewards/format_reward": 1.0, | |
| "step": 453 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.21484375, | |
| "epoch": 0.8582230623818525, | |
| "grad_norm": 1.4554139396903383, | |
| "kl": 0.04345703125, | |
| "learning_rate": 9.756391022154953e-08, | |
| "loss": 0.0017, | |
| "reward": 1.6482672691345215, | |
| "reward_std": 0.20143428444862366, | |
| "rewards/accuracy_reward": 0.6482672095298767, | |
| "rewards/format_reward": 1.0, | |
| "step": 454 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.21875, | |
| "completion_length": 91.6875, | |
| "epoch": 0.8601134215500945, | |
| "grad_norm": 1.3848840834631115, | |
| "kl": 0.05029296875, | |
| "learning_rate": 9.502128946712862e-08, | |
| "loss": 0.002, | |
| "reward": 1.5533459186553955, | |
| "reward_std": 0.1582801640033722, | |
| "rewards/accuracy_reward": 0.5572521090507507, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 455 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.28125, | |
| "completion_length": 87.30078125, | |
| "epoch": 0.8620037807183365, | |
| "grad_norm": 1.474494768169295, | |
| "kl": 0.0390625, | |
| "learning_rate": 9.251058596433792e-08, | |
| "loss": 0.0016, | |
| "reward": 1.566476821899414, | |
| "reward_std": 0.08095038682222366, | |
| "rewards/accuracy_reward": 0.5664768218994141, | |
| "rewards/format_reward": 1.0, | |
| "step": 456 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.0625, | |
| "completion_length": 92.46484375, | |
| "epoch": 0.8638941398865785, | |
| "grad_norm": 2.228512668796117, | |
| "kl": 0.04150390625, | |
| "learning_rate": 9.003188826195141e-08, | |
| "loss": 0.0017, | |
| "reward": 1.6768805980682373, | |
| "reward_std": 0.21106885373592377, | |
| "rewards/accuracy_reward": 0.6768805384635925, | |
| "rewards/format_reward": 1.0, | |
| "step": 457 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 89.76953125, | |
| "epoch": 0.8657844990548205, | |
| "grad_norm": 2.381172656339536, | |
| "kl": 0.043701171875, | |
| "learning_rate": 8.758528377994667e-08, | |
| "loss": 0.0017, | |
| "reward": 1.5411908626556396, | |
| "reward_std": 0.182044118642807, | |
| "rewards/accuracy_reward": 0.5411908626556396, | |
| "rewards/format_reward": 1.0, | |
| "step": 458 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 97.53125, | |
| "epoch": 0.8676748582230623, | |
| "grad_norm": 1.1370034333672576, | |
| "kl": 0.042724609375, | |
| "learning_rate": 8.51708588064206e-08, | |
| "loss": 0.0017, | |
| "reward": 1.5652220249176025, | |
| "reward_std": 0.13882781565189362, | |
| "rewards/accuracy_reward": 0.5730345249176025, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 459 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.28125, | |
| "completion_length": 99.2421875, | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 1.9476181687483543, | |
| "kl": 0.041259765625, | |
| "learning_rate": 8.278869849454717e-08, | |
| "loss": 0.0017, | |
| "reward": 1.478024959564209, | |
| "reward_std": 0.17786462604999542, | |
| "rewards/accuracy_reward": 0.47802501916885376, | |
| "rewards/format_reward": 1.0, | |
| "step": 460 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 100.20703125, | |
| "epoch": 0.8714555765595463, | |
| "grad_norm": 1.224210562302336, | |
| "kl": 0.0380859375, | |
| "learning_rate": 8.043888685957312e-08, | |
| "loss": 0.0015, | |
| "reward": 1.5925004482269287, | |
| "reward_std": 0.14712047576904297, | |
| "rewards/accuracy_reward": 0.5925004482269287, | |
| "rewards/format_reward": 1.0, | |
| "step": 461 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 91.64453125, | |
| "epoch": 0.8733459357277883, | |
| "grad_norm": 4.6192400629488395, | |
| "kl": 0.045654296875, | |
| "learning_rate": 7.812150677585671e-08, | |
| "loss": 0.0018, | |
| "reward": 1.5704599618911743, | |
| "reward_std": 0.1768045723438263, | |
| "rewards/accuracy_reward": 0.5743662118911743, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 462 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 94.3828125, | |
| "epoch": 0.8752362948960303, | |
| "grad_norm": 1.9953801863403324, | |
| "kl": 0.049560546875, | |
| "learning_rate": 7.58366399739424e-08, | |
| "loss": 0.002, | |
| "reward": 1.5050649642944336, | |
| "reward_std": 0.11211474239826202, | |
| "rewards/accuracy_reward": 0.5089712142944336, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 463 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.125, | |
| "completion_length": 90.1640625, | |
| "epoch": 0.8771266540642723, | |
| "grad_norm": 1.8235799095393221, | |
| "kl": 0.043212890625, | |
| "learning_rate": 7.358436703768034e-08, | |
| "loss": 0.0017, | |
| "reward": 1.589560627937317, | |
| "reward_std": 0.16465333104133606, | |
| "rewards/accuracy_reward": 0.5895605683326721, | |
| "rewards/format_reward": 1.0, | |
| "step": 464 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.21875, | |
| "completion_length": 95.63671875, | |
| "epoch": 0.8790170132325141, | |
| "grad_norm": 1.0760041562687386, | |
| "kl": 0.0380859375, | |
| "learning_rate": 7.136476740138387e-08, | |
| "loss": 0.0015, | |
| "reward": 1.5880682468414307, | |
| "reward_std": 0.15734529495239258, | |
| "rewards/accuracy_reward": 0.6154119968414307, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 465 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.25, | |
| "completion_length": 101.12890625, | |
| "epoch": 0.8809073724007561, | |
| "grad_norm": 1.4738314342767895, | |
| "kl": 0.044189453125, | |
| "learning_rate": 6.917791934702655e-08, | |
| "loss": 0.0018, | |
| "reward": 1.4817919731140137, | |
| "reward_std": 0.2077905535697937, | |
| "rewards/accuracy_reward": 0.48179197311401367, | |
| "rewards/format_reward": 1.0, | |
| "step": 466 | |
| }, | |
| { | |
| "all_correct": 0.53125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.58203125, | |
| "epoch": 0.8827977315689981, | |
| "grad_norm": 1.0571716079445075, | |
| "kl": 0.04296875, | |
| "learning_rate": 6.70239000014835e-08, | |
| "loss": 0.0017, | |
| "reward": 1.6927690505981445, | |
| "reward_std": 0.14029529690742493, | |
| "rewards/accuracy_reward": 0.6966753005981445, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 467 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 97.92578125, | |
| "epoch": 0.8846880907372401, | |
| "grad_norm": 1.5234291263564896, | |
| "kl": 0.0458984375, | |
| "learning_rate": 6.490278533380955e-08, | |
| "loss": 0.0018, | |
| "reward": 1.540100336074829, | |
| "reward_std": 0.2111871838569641, | |
| "rewards/accuracy_reward": 0.5635378360748291, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 468 | |
| }, | |
| { | |
| "all_correct": 0.375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 83.59375, | |
| "epoch": 0.8865784499054821, | |
| "grad_norm": 2.0255739847362766, | |
| "kl": 0.049560546875, | |
| "learning_rate": 6.281465015256093e-08, | |
| "loss": 0.002, | |
| "reward": 1.5604361295700073, | |
| "reward_std": 0.17061945796012878, | |
| "rewards/accuracy_reward": 0.5604361891746521, | |
| "rewards/format_reward": 1.0, | |
| "step": 469 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 103.4921875, | |
| "epoch": 0.888468809073724, | |
| "grad_norm": 1.4503561746862432, | |
| "kl": 0.044921875, | |
| "learning_rate": 6.075956810315619e-08, | |
| "loss": 0.0018, | |
| "reward": 1.599052906036377, | |
| "reward_std": 0.1590302437543869, | |
| "rewards/accuracy_reward": 0.5990527868270874, | |
| "rewards/format_reward": 1.0, | |
| "step": 470 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 85.69921875, | |
| "epoch": 0.8903591682419659, | |
| "grad_norm": 1.9843134997693552, | |
| "kl": 0.052001953125, | |
| "learning_rate": 5.8737611665279355e-08, | |
| "loss": 0.0021, | |
| "reward": 1.624929666519165, | |
| "reward_std": 0.15200699865818024, | |
| "rewards/accuracy_reward": 0.624929666519165, | |
| "rewards/format_reward": 1.0, | |
| "step": 471 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.125, | |
| "completion_length": 95.69921875, | |
| "epoch": 0.8922495274102079, | |
| "grad_norm": 1.9277786145166012, | |
| "kl": 0.0439453125, | |
| "learning_rate": 5.6748852150323215e-08, | |
| "loss": 0.0018, | |
| "reward": 1.5506601333618164, | |
| "reward_std": 0.21757060289382935, | |
| "rewards/accuracy_reward": 0.5506601333618164, | |
| "rewards/format_reward": 1.0, | |
| "step": 472 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 94.9140625, | |
| "epoch": 0.8941398865784499, | |
| "grad_norm": 1.3171715987687558, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.479335969887466e-08, | |
| "loss": 0.0017, | |
| "reward": 1.486616611480713, | |
| "reward_std": 0.17591437697410583, | |
| "rewards/accuracy_reward": 0.49052292108535767, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 473 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.34375, | |
| "completion_length": 92.6640625, | |
| "epoch": 0.8960302457466919, | |
| "grad_norm": 1.1864505239231775, | |
| "kl": 0.05126953125, | |
| "learning_rate": 5.2871203278240906e-08, | |
| "loss": 0.002, | |
| "reward": 1.528516173362732, | |
| "reward_std": 0.11142821609973907, | |
| "rewards/accuracy_reward": 0.5285161733627319, | |
| "rewards/format_reward": 1.0, | |
| "step": 474 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.1875, | |
| "completion_length": 99.765625, | |
| "epoch": 0.8979206049149339, | |
| "grad_norm": 3.793748033276946, | |
| "kl": 0.0390625, | |
| "learning_rate": 5.098245068001661e-08, | |
| "loss": 0.0016, | |
| "reward": 1.4771013259887695, | |
| "reward_std": 0.22667381167411804, | |
| "rewards/accuracy_reward": 0.47710129618644714, | |
| "rewards/format_reward": 1.0, | |
| "step": 475 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 83.51953125, | |
| "epoch": 0.8998109640831758, | |
| "grad_norm": 6.850505412934267, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.912716851769394e-08, | |
| "loss": 0.0018, | |
| "reward": 1.649026870727539, | |
| "reward_std": 0.15467888116836548, | |
| "rewards/accuracy_reward": 0.6490268707275391, | |
| "rewards/format_reward": 1.0, | |
| "step": 476 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.7109375, | |
| "epoch": 0.9017013232514177, | |
| "grad_norm": 3.519242696315356, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.730542222431222e-08, | |
| "loss": 0.0018, | |
| "reward": 1.6143535375595093, | |
| "reward_std": 0.17040878534317017, | |
| "rewards/accuracy_reward": 0.6143535375595093, | |
| "rewards/format_reward": 1.0, | |
| "step": 477 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.99609375, | |
| "epoch": 0.9035916824196597, | |
| "grad_norm": 3.880434261673006, | |
| "kl": 0.038818359375, | |
| "learning_rate": 4.5517276050150325e-08, | |
| "loss": 0.0016, | |
| "reward": 1.5734894275665283, | |
| "reward_std": 0.16900530457496643, | |
| "rewards/accuracy_reward": 0.5734893679618835, | |
| "rewards/format_reward": 1.0, | |
| "step": 478 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 90.35546875, | |
| "epoch": 0.9054820415879017, | |
| "grad_norm": 1.8707086039575145, | |
| "kl": 0.041015625, | |
| "learning_rate": 4.3762793060461824e-08, | |
| "loss": 0.0016, | |
| "reward": 1.6607571840286255, | |
| "reward_std": 0.23725268244743347, | |
| "rewards/accuracy_reward": 0.6763821840286255, | |
| "rewards/format_reward": 0.984375, | |
| "step": 479 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.1875, | |
| "completion_length": 87.125, | |
| "epoch": 0.9073724007561437, | |
| "grad_norm": 2.978657513471123, | |
| "kl": 0.048583984375, | |
| "learning_rate": 4.2042035133248885e-08, | |
| "loss": 0.0019, | |
| "reward": 1.4891417026519775, | |
| "reward_std": 0.2210281491279602, | |
| "rewards/accuracy_reward": 0.48914170265197754, | |
| "rewards/format_reward": 1.0, | |
| "step": 480 | |
| }, | |
| { | |
| "all_correct": 0.21875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 88.83984375, | |
| "epoch": 0.9092627599243857, | |
| "grad_norm": 1.9106697468452067, | |
| "kl": 0.0458984375, | |
| "learning_rate": 4.035506295708191e-08, | |
| "loss": 0.0018, | |
| "reward": 1.4589961767196655, | |
| "reward_std": 0.21063633263111115, | |
| "rewards/accuracy_reward": 0.46290236711502075, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 481 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.41796875, | |
| "epoch": 0.9111531190926276, | |
| "grad_norm": 3.468786099498894, | |
| "kl": 0.046875, | |
| "learning_rate": 3.870193602895733e-08, | |
| "loss": 0.0019, | |
| "reward": 1.6448715925216675, | |
| "reward_std": 0.16561608016490936, | |
| "rewards/accuracy_reward": 0.6448715329170227, | |
| "rewards/format_reward": 1.0, | |
| "step": 482 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.15625, | |
| "completion_length": 96.2109375, | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 2.104789227248534, | |
| "kl": 0.0361328125, | |
| "learning_rate": 3.708271265220087e-08, | |
| "loss": 0.0015, | |
| "reward": 1.6773532629013062, | |
| "reward_std": 0.16596126556396484, | |
| "rewards/accuracy_reward": 0.7046970725059509, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 483 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.28125, | |
| "completion_length": 89.72265625, | |
| "epoch": 0.9149338374291115, | |
| "grad_norm": 6.692599057258857, | |
| "kl": 0.042236328125, | |
| "learning_rate": 3.5497449934409396e-08, | |
| "loss": 0.0017, | |
| "reward": 1.45703125, | |
| "reward_std": 0.19940373301506042, | |
| "rewards/accuracy_reward": 0.46484375, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 484 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.15625, | |
| "completion_length": 89.53515625, | |
| "epoch": 0.9168241965973535, | |
| "grad_norm": 1.8841313081503308, | |
| "kl": 0.05078125, | |
| "learning_rate": 3.394620378543911e-08, | |
| "loss": 0.002, | |
| "reward": 1.6709468364715576, | |
| "reward_std": 0.13965073227882385, | |
| "rewards/accuracy_reward": 0.6826655268669128, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 485 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.25, | |
| "completion_length": 86.93359375, | |
| "epoch": 0.9187145557655955, | |
| "grad_norm": 1.673566002692786, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.2429028915431534e-08, | |
| "loss": 0.0018, | |
| "reward": 1.5806677341461182, | |
| "reward_std": 0.13359013199806213, | |
| "rewards/accuracy_reward": 0.5806676149368286, | |
| "rewards/format_reward": 1.0, | |
| "step": 486 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.890625, | |
| "epoch": 0.9206049149338374, | |
| "grad_norm": 4.461429054725898, | |
| "kl": 0.041259765625, | |
| "learning_rate": 3.094597883288574e-08, | |
| "loss": 0.0016, | |
| "reward": 1.56640625, | |
| "reward_std": 0.2223491370677948, | |
| "rewards/accuracy_reward": 0.5703125, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 487 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.09375, | |
| "completion_length": 99.546875, | |
| "epoch": 0.9224952741020794, | |
| "grad_norm": 2.074395570322666, | |
| "kl": 0.04638671875, | |
| "learning_rate": 2.9497105842769433e-08, | |
| "loss": 0.0019, | |
| "reward": 1.6248832941055298, | |
| "reward_std": 0.2539004683494568, | |
| "rewards/accuracy_reward": 0.636601984500885, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 488 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 84.6328125, | |
| "epoch": 0.9243856332703214, | |
| "grad_norm": 1.4101059267718767, | |
| "kl": 0.04833984375, | |
| "learning_rate": 2.808246104467582e-08, | |
| "loss": 0.0019, | |
| "reward": 1.5731714963912964, | |
| "reward_std": 0.1267606019973755, | |
| "rewards/accuracy_reward": 0.5731715559959412, | |
| "rewards/format_reward": 1.0, | |
| "step": 489 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.125, | |
| "completion_length": 89.83203125, | |
| "epoch": 0.9262759924385633, | |
| "grad_norm": 2.2978085302274267, | |
| "kl": 0.04541015625, | |
| "learning_rate": 2.6702094331020886e-08, | |
| "loss": 0.0018, | |
| "reward": 1.6596397161483765, | |
| "reward_std": 0.18694524466991425, | |
| "rewards/accuracy_reward": 0.6596397161483765, | |
| "rewards/format_reward": 1.0, | |
| "step": 490 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 91.2265625, | |
| "epoch": 0.9281663516068053, | |
| "grad_norm": 1.207942110015228, | |
| "kl": 0.044677734375, | |
| "learning_rate": 2.5356054385282766e-08, | |
| "loss": 0.0018, | |
| "reward": 1.611553430557251, | |
| "reward_std": 0.09651355445384979, | |
| "rewards/accuracy_reward": 0.611553430557251, | |
| "rewards/format_reward": 1.0, | |
| "step": 491 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 100.76953125, | |
| "epoch": 0.9300567107750473, | |
| "grad_norm": 1.46194048244851, | |
| "kl": 0.039794921875, | |
| "learning_rate": 2.4044388680286575e-08, | |
| "loss": 0.0016, | |
| "reward": 1.69921875, | |
| "reward_std": 0.25242602825164795, | |
| "rewards/accuracy_reward": 0.7109375, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 492 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.03125, | |
| "completion_length": 103.296875, | |
| "epoch": 0.9319470699432892, | |
| "grad_norm": 1.6142721258162138, | |
| "kl": 0.03857421875, | |
| "learning_rate": 2.2767143476528306e-08, | |
| "loss": 0.0015, | |
| "reward": 1.6534472703933716, | |
| "reward_std": 0.2080773115158081, | |
| "rewards/accuracy_reward": 0.6690722107887268, | |
| "rewards/format_reward": 0.984375, | |
| "step": 493 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.0625, | |
| "completion_length": 96.51171875, | |
| "epoch": 0.9338374291115312, | |
| "grad_norm": 1.5128739509537954, | |
| "kl": 0.037841796875, | |
| "learning_rate": 2.152436382054479e-08, | |
| "loss": 0.0015, | |
| "reward": 1.6529420614242554, | |
| "reward_std": 0.22081422805786133, | |
| "rewards/accuracy_reward": 0.6607545614242554, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 494 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 88.4765625, | |
| "epoch": 0.9357277882797732, | |
| "grad_norm": 1.4350308350145478, | |
| "kl": 0.05126953125, | |
| "learning_rate": 2.0316093543323753e-08, | |
| "loss": 0.0021, | |
| "reward": 1.5785757303237915, | |
| "reward_std": 0.1358921080827713, | |
| "rewards/accuracy_reward": 0.5785757303237915, | |
| "rewards/format_reward": 1.0, | |
| "step": 495 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.28125, | |
| "completion_length": 91.57421875, | |
| "epoch": 0.9376181474480151, | |
| "grad_norm": 1.4943511024476006, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1.914237525875917e-08, | |
| "loss": 0.0017, | |
| "reward": 1.4834184646606445, | |
| "reward_std": 0.1201879009604454, | |
| "rewards/accuracy_reward": 0.48341846466064453, | |
| "rewards/format_reward": 1.0, | |
| "step": 496 | |
| }, | |
| { | |
| "all_correct": 0.5, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.48046875, | |
| "epoch": 0.9395085066162571, | |
| "grad_norm": 1.2851391631205682, | |
| "kl": 0.040283203125, | |
| "learning_rate": 1.8003250362147004e-08, | |
| "loss": 0.0016, | |
| "reward": 1.651926040649414, | |
| "reward_std": 0.11862440407276154, | |
| "rewards/accuracy_reward": 0.6519260406494141, | |
| "rewards/format_reward": 1.0, | |
| "step": 497 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.25, | |
| "completion_length": 91.71484375, | |
| "epoch": 0.941398865784499, | |
| "grad_norm": 3.68427646191117, | |
| "kl": 0.04296875, | |
| "learning_rate": 1.6898759028726283e-08, | |
| "loss": 0.0017, | |
| "reward": 1.55078125, | |
| "reward_std": 0.19926638901233673, | |
| "rewards/accuracy_reward": 0.5546875, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 498 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.25, | |
| "completion_length": 88.03515625, | |
| "epoch": 0.943289224952741, | |
| "grad_norm": 1.9461501149892964, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.5828940212261887e-08, | |
| "loss": 0.0018, | |
| "reward": 1.4915789365768433, | |
| "reward_std": 0.1742965131998062, | |
| "rewards/accuracy_reward": 0.49157893657684326, | |
| "rewards/format_reward": 1.0, | |
| "step": 499 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 104.78515625, | |
| "epoch": 0.945179584120983, | |
| "grad_norm": 1.4451954837871217, | |
| "kl": 0.040283203125, | |
| "learning_rate": 1.4793831643670429e-08, | |
| "loss": 0.0016, | |
| "reward": 1.5099118947982788, | |
| "reward_std": 0.25256532430648804, | |
| "rewards/accuracy_reward": 0.5294432044029236, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 500 | |
| }, | |
| { | |
| "all_correct": 0.46875, | |
| "all_wrong": 0.21875, | |
| "completion_length": 100.34765625, | |
| "epoch": 0.947069943289225, | |
| "grad_norm": 1.6974654768712778, | |
| "kl": 0.039794921875, | |
| "learning_rate": 1.3793469829689986e-08, | |
| "loss": 0.0016, | |
| "reward": 1.6564844846725464, | |
| "reward_std": 0.11369632184505463, | |
| "rewards/accuracy_reward": 0.6564844846725464, | |
| "rewards/format_reward": 1.0, | |
| "step": 501 | |
| }, | |
| { | |
| "all_correct": 0.34375, | |
| "all_wrong": 0.1875, | |
| "completion_length": 90.28515625, | |
| "epoch": 0.9489603024574669, | |
| "grad_norm": 1.725404425317813, | |
| "kl": 0.040283203125, | |
| "learning_rate": 1.2827890051592127e-08, | |
| "loss": 0.0016, | |
| "reward": 1.6119554042816162, | |
| "reward_std": 0.16685199737548828, | |
| "rewards/accuracy_reward": 0.6119554042816162, | |
| "rewards/format_reward": 1.0, | |
| "step": 502 | |
| }, | |
| { | |
| "all_correct": 0.3125, | |
| "all_wrong": 0.21875, | |
| "completion_length": 100.02734375, | |
| "epoch": 0.9508506616257089, | |
| "grad_norm": 1.6905254249447337, | |
| "kl": 0.04150390625, | |
| "learning_rate": 1.1897126363937803e-08, | |
| "loss": 0.0017, | |
| "reward": 1.6142412424087524, | |
| "reward_std": 0.18831773102283478, | |
| "rewards/accuracy_reward": 0.6142412424087524, | |
| "rewards/format_reward": 1.0, | |
| "step": 503 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.15625, | |
| "completion_length": 97.78125, | |
| "epoch": 0.9527410207939508, | |
| "grad_norm": 1.4091412694145533, | |
| "kl": 0.0390625, | |
| "learning_rate": 1.1001211593376525e-08, | |
| "loss": 0.0016, | |
| "reward": 1.5735445022583008, | |
| "reward_std": 0.17769688367843628, | |
| "rewards/accuracy_reward": 0.577450692653656, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 504 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.1875, | |
| "completion_length": 81.171875, | |
| "epoch": 0.9546313799621928, | |
| "grad_norm": 2.0493717412947645, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1.0140177337488287e-08, | |
| "loss": 0.0018, | |
| "reward": 1.6359052658081055, | |
| "reward_std": 0.15704122185707092, | |
| "rewards/accuracy_reward": 0.6359052062034607, | |
| "rewards/format_reward": 1.0, | |
| "step": 505 | |
| }, | |
| { | |
| "all_correct": 0.1875, | |
| "all_wrong": 0.1875, | |
| "completion_length": 91.32421875, | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 3.7195758010441407, | |
| "kl": 0.047607421875, | |
| "learning_rate": 9.314053963669244e-09, | |
| "loss": 0.0019, | |
| "reward": 1.5355010032653809, | |
| "reward_std": 0.20620205998420715, | |
| "rewards/accuracy_reward": 0.5355010628700256, | |
| "rewards/format_reward": 1.0, | |
| "step": 506 | |
| }, | |
| { | |
| "all_correct": 0.40625, | |
| "all_wrong": 0.125, | |
| "completion_length": 93.41796875, | |
| "epoch": 0.9584120982986768, | |
| "grad_norm": 4.273675771479045, | |
| "kl": 0.0419921875, | |
| "learning_rate": 8.522870608060562e-09, | |
| "loss": 0.0017, | |
| "reward": 1.6529306173324585, | |
| "reward_std": 0.17892761528491974, | |
| "rewards/accuracy_reward": 0.6529306173324585, | |
| "rewards/format_reward": 1.0, | |
| "step": 507 | |
| }, | |
| { | |
| "all_correct": 0.4375, | |
| "all_wrong": 0.15625, | |
| "completion_length": 95.71875, | |
| "epoch": 0.9603024574669187, | |
| "grad_norm": 1.471506691627891, | |
| "kl": 0.045166015625, | |
| "learning_rate": 7.766655174521464e-09, | |
| "loss": 0.0018, | |
| "reward": 1.6168668270111084, | |
| "reward_std": 0.13944411277770996, | |
| "rewards/accuracy_reward": 0.6168668270111084, | |
| "rewards/format_reward": 1.0, | |
| "step": 508 | |
| }, | |
| { | |
| "all_correct": 0.25, | |
| "all_wrong": 0.21875, | |
| "completion_length": 94.5, | |
| "epoch": 0.9621928166351607, | |
| "grad_norm": 3.051208934788439, | |
| "kl": 0.042236328125, | |
| "learning_rate": 7.045434333643796e-09, | |
| "loss": 0.0017, | |
| "reward": 1.5761425495147705, | |
| "reward_std": 0.22245533764362335, | |
| "rewards/accuracy_reward": 0.5761424899101257, | |
| "rewards/format_reward": 1.0, | |
| "step": 509 | |
| }, | |
| { | |
| "all_correct": 0.28125, | |
| "all_wrong": 0.15625, | |
| "completion_length": 90.06640625, | |
| "epoch": 0.9640831758034026, | |
| "grad_norm": 1.9014046704475331, | |
| "kl": 0.042724609375, | |
| "learning_rate": 6.3592335218132235e-09, | |
| "loss": 0.0017, | |
| "reward": 1.5352835655212402, | |
| "reward_std": 0.22351181507110596, | |
| "rewards/accuracy_reward": 0.5352836847305298, | |
| "rewards/format_reward": 1.0, | |
| "step": 510 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 529, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 510, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |