Video-KTR-7B / trainer_state.json
Ziyue2022's picture
Upload folder using huggingface_hub
a23a8a9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9640831758034026,
"eval_steps": 500,
"global_step": 510,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"all_correct": 0.0625,
"all_wrong": 0.0625,
"completion_length": 95.96875,
"epoch": 0.001890359168241966,
"grad_norm": 2.3602562531627758,
"kl": 0.0,
"learning_rate": 1.999982365744487e-06,
"loss": 0.0,
"reward": 1.514066219329834,
"reward_std": 0.375105082988739,
"rewards/accuracy_reward": 0.5140663385391235,
"rewards/format_reward": 1.0,
"step": 1
},
{
"all_correct": 0.0625,
"all_wrong": 0.09375,
"completion_length": 92.3515625,
"epoch": 0.003780718336483932,
"grad_norm": 6.640343536616842,
"kl": 0.0013275146484375,
"learning_rate": 1.999929463599883e-06,
"loss": 0.0001,
"reward": 1.417905330657959,
"reward_std": 0.37063267827033997,
"rewards/accuracy_reward": 0.42181164026260376,
"rewards/format_reward": 0.99609375,
"step": 2
},
{
"all_correct": 0.0625,
"all_wrong": 0.125,
"completion_length": 90.35546875,
"epoch": 0.005671077504725898,
"grad_norm": 2.414094297005193,
"kl": 0.00127410888671875,
"learning_rate": 1.9998412954319676e-06,
"loss": 0.0001,
"reward": 1.4140625,
"reward_std": 0.3621976673603058,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 1.0,
"step": 3
},
{
"all_correct": 0.09375,
"all_wrong": 0.1875,
"completion_length": 92.7734375,
"epoch": 0.007561436672967864,
"grad_norm": 2.2244626207569187,
"kl": 0.00150299072265625,
"learning_rate": 1.9997178643503e-06,
"loss": 0.0001,
"reward": 1.408446192741394,
"reward_std": 0.3336002826690674,
"rewards/accuracy_reward": 0.40844619274139404,
"rewards/format_reward": 1.0,
"step": 4
},
{
"all_correct": 0.125,
"all_wrong": 0.0625,
"completion_length": 89.6640625,
"epoch": 0.00945179584120983,
"grad_norm": 2.3915039557721607,
"kl": 0.0021209716796875,
"learning_rate": 1.999559174708112e-06,
"loss": 0.0001,
"reward": 1.5034363269805908,
"reward_std": 0.37499552965164185,
"rewards/accuracy_reward": 0.5073425769805908,
"rewards/format_reward": 0.99609375,
"step": 5
},
{
"all_correct": 0.03125,
"all_wrong": 0.09375,
"completion_length": 88.1875,
"epoch": 0.011342155009451797,
"grad_norm": 4.082221628547588,
"kl": 0.00335693359375,
"learning_rate": 1.99936523210215e-06,
"loss": 0.0001,
"reward": 1.4816137552261353,
"reward_std": 0.3769935369491577,
"rewards/accuracy_reward": 0.48161375522613525,
"rewards/format_reward": 1.0,
"step": 6
},
{
"all_correct": 0.15625,
"all_wrong": 0.03125,
"completion_length": 84.96875,
"epoch": 0.013232514177693762,
"grad_norm": 2.411737580254065,
"kl": 0.00433349609375,
"learning_rate": 1.999136043372481e-06,
"loss": 0.0002,
"reward": 1.6246747970581055,
"reward_std": 0.3329503536224365,
"rewards/accuracy_reward": 0.6246747970581055,
"rewards/format_reward": 1.0,
"step": 7
},
{
"all_correct": 0.09375,
"all_wrong": 0.125,
"completion_length": 86.140625,
"epoch": 0.015122873345935728,
"grad_norm": 2.5943017844391107,
"kl": 0.00537109375,
"learning_rate": 1.9988716166022506e-06,
"loss": 0.0002,
"reward": 1.4784858226776123,
"reward_std": 0.3560720384120941,
"rewards/accuracy_reward": 0.4784858524799347,
"rewards/format_reward": 1.0,
"step": 8
},
{
"all_correct": 0.0625,
"all_wrong": 0.09375,
"completion_length": 82.1328125,
"epoch": 0.017013232514177693,
"grad_norm": 2.1704796065424508,
"kl": 0.00628662109375,
"learning_rate": 1.998571961117397e-06,
"loss": 0.0003,
"reward": 1.4728260040283203,
"reward_std": 0.3873444199562073,
"rewards/accuracy_reward": 0.47282594442367554,
"rewards/format_reward": 1.0,
"step": 9
},
{
"all_correct": 0.125,
"all_wrong": 0.09375,
"completion_length": 90.22265625,
"epoch": 0.01890359168241966,
"grad_norm": 1.9312909564522398,
"kl": 0.0068359375,
"learning_rate": 1.9982370874863233e-06,
"loss": 0.0003,
"reward": 1.5304478406906128,
"reward_std": 0.3415384888648987,
"rewards/accuracy_reward": 0.5343540906906128,
"rewards/format_reward": 0.99609375,
"step": 10
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 99.89453125,
"epoch": 0.020793950850661626,
"grad_norm": 2.183103416981468,
"kl": 0.0074462890625,
"learning_rate": 1.9978670075195237e-06,
"loss": 0.0003,
"reward": 1.5692423582077026,
"reward_std": 0.39161738753318787,
"rewards/accuracy_reward": 0.5848673582077026,
"rewards/format_reward": 0.984375,
"step": 11
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 93.22265625,
"epoch": 0.022684310018903593,
"grad_norm": 8.227068776940921,
"kl": 0.00958251953125,
"learning_rate": 1.9974617342691674e-06,
"loss": 0.0004,
"reward": 1.468308687210083,
"reward_std": 0.46158915758132935,
"rewards/accuracy_reward": 0.47612112760543823,
"rewards/format_reward": 0.9921875,
"step": 12
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 87.81640625,
"epoch": 0.024574669187145556,
"grad_norm": 3.223495772387027,
"kl": 0.01129150390625,
"learning_rate": 1.997021282028639e-06,
"loss": 0.0005,
"reward": 1.5035626888275146,
"reward_std": 0.3581737279891968,
"rewards/accuracy_reward": 0.5035626888275146,
"rewards/format_reward": 1.0,
"step": 13
},
{
"all_correct": 0.21875,
"all_wrong": 0.0625,
"completion_length": 88.83984375,
"epoch": 0.026465028355387523,
"grad_norm": 3.1552889851956594,
"kl": 0.0126953125,
"learning_rate": 1.9965456663320324e-06,
"loss": 0.0005,
"reward": 1.507861852645874,
"reward_std": 0.294893354177475,
"rewards/accuracy_reward": 0.507861852645874,
"rewards/format_reward": 1.0,
"step": 14
},
{
"all_correct": 0.03125,
"all_wrong": 0.125,
"completion_length": 92.70703125,
"epoch": 0.02835538752362949,
"grad_norm": 2.6633455753050947,
"kl": 0.011962890625,
"learning_rate": 1.996034903953606e-06,
"loss": 0.0005,
"reward": 1.403101921081543,
"reward_std": 0.35901227593421936,
"rewards/accuracy_reward": 0.4031018912792206,
"rewards/format_reward": 1.0,
"step": 15
},
{
"all_correct": 0.0625,
"all_wrong": 0.0,
"completion_length": 79.59765625,
"epoch": 0.030245746691871456,
"grad_norm": 4.571581966102541,
"kl": 0.0162353515625,
"learning_rate": 1.9954890129071873e-06,
"loss": 0.0007,
"reward": 1.5757702589035034,
"reward_std": 0.3923387825489044,
"rewards/accuracy_reward": 0.5757702589035034,
"rewards/format_reward": 1.0,
"step": 16
},
{
"all_correct": 0.09375,
"all_wrong": 0.03125,
"completion_length": 82.7890625,
"epoch": 0.03213610586011342,
"grad_norm": 2.393421904457873,
"kl": 0.01361083984375,
"learning_rate": 1.9949080124455415e-06,
"loss": 0.0005,
"reward": 1.5704209804534912,
"reward_std": 0.3697139024734497,
"rewards/accuracy_reward": 0.5704209804534912,
"rewards/format_reward": 1.0,
"step": 17
},
{
"all_correct": 0.15625,
"all_wrong": 0.125,
"completion_length": 88.5234375,
"epoch": 0.034026465028355386,
"grad_norm": 2.8917838645260643,
"kl": 0.0155029296875,
"learning_rate": 1.9942919230596897e-06,
"loss": 0.0006,
"reward": 1.5118800401687622,
"reward_std": 0.3223443031311035,
"rewards/accuracy_reward": 0.5196925401687622,
"rewards/format_reward": 0.9921875,
"step": 18
},
{
"all_correct": 0.0625,
"all_wrong": 0.09375,
"completion_length": 84.2421875,
"epoch": 0.035916824196597356,
"grad_norm": 5.108847500395768,
"kl": 0.0186767578125,
"learning_rate": 1.9936407664781867e-06,
"loss": 0.0007,
"reward": 1.5046335458755493,
"reward_std": 0.3630630373954773,
"rewards/accuracy_reward": 0.5046335458755493,
"rewards/format_reward": 1.0,
"step": 19
},
{
"all_correct": 0.125,
"all_wrong": 0.09375,
"completion_length": 84.87890625,
"epoch": 0.03780718336483932,
"grad_norm": 2.1872918469896065,
"kl": 0.0181884765625,
"learning_rate": 1.992954565666356e-06,
"loss": 0.0007,
"reward": 1.5272603034973145,
"reward_std": 0.32879531383514404,
"rewards/accuracy_reward": 0.5350728034973145,
"rewards/format_reward": 0.9921875,
"step": 20
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 79.5859375,
"epoch": 0.03969754253308128,
"grad_norm": 3.3985079753937026,
"kl": 0.0186767578125,
"learning_rate": 1.9922333448254785e-06,
"loss": 0.0007,
"reward": 1.4413049221038818,
"reward_std": 0.3449176847934723,
"rewards/accuracy_reward": 0.44130486249923706,
"rewards/format_reward": 1.0,
"step": 21
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 87.44921875,
"epoch": 0.04158790170132325,
"grad_norm": 2.1124500662245302,
"kl": 0.0181884765625,
"learning_rate": 1.9914771293919394e-06,
"loss": 0.0007,
"reward": 1.5039958953857422,
"reward_std": 0.36595243215560913,
"rewards/accuracy_reward": 0.5039960145950317,
"rewards/format_reward": 1.0,
"step": 22
},
{
"all_correct": 0.125,
"all_wrong": 0.1875,
"completion_length": 90.96484375,
"epoch": 0.043478260869565216,
"grad_norm": 2.57988777327046,
"kl": 0.020263671875,
"learning_rate": 1.9906859460363304e-06,
"loss": 0.0008,
"reward": 1.406567931175232,
"reward_std": 0.32273876667022705,
"rewards/accuracy_reward": 0.41828668117523193,
"rewards/format_reward": 0.98828125,
"step": 23
},
{
"all_correct": 0.125,
"all_wrong": 0.0625,
"completion_length": 81.42578125,
"epoch": 0.045368620037807186,
"grad_norm": 2.877431959109021,
"kl": 0.0211181640625,
"learning_rate": 1.9898598226625114e-06,
"loss": 0.0008,
"reward": 1.5652744770050049,
"reward_std": 0.33840304613113403,
"rewards/accuracy_reward": 0.5652744770050049,
"rewards/format_reward": 1.0,
"step": 24
},
{
"all_correct": 0.0,
"all_wrong": 0.0625,
"completion_length": 82.98046875,
"epoch": 0.04725897920604915,
"grad_norm": 2.669500830918649,
"kl": 0.01953125,
"learning_rate": 1.9889987884066234e-06,
"loss": 0.0008,
"reward": 1.457775354385376,
"reward_std": 0.3964886963367462,
"rewards/accuracy_reward": 0.4577752947807312,
"rewards/format_reward": 1.0,
"step": 25
},
{
"all_correct": 0.03125,
"all_wrong": 0.125,
"completion_length": 90.69921875,
"epoch": 0.04914933837429111,
"grad_norm": 2.1403683676993577,
"kl": 0.01806640625,
"learning_rate": 1.9881028736360623e-06,
"loss": 0.0007,
"reward": 1.3968093395233154,
"reward_std": 0.3421282172203064,
"rewards/accuracy_reward": 0.3968093991279602,
"rewards/format_reward": 1.0,
"step": 26
},
{
"all_correct": 0.0625,
"all_wrong": 0.0625,
"completion_length": 86.796875,
"epoch": 0.05103969754253308,
"grad_norm": 2.435390719204286,
"kl": 0.0238037109375,
"learning_rate": 1.9871721099484077e-06,
"loss": 0.001,
"reward": 1.4553546905517578,
"reward_std": 0.3946911692619324,
"rewards/accuracy_reward": 0.4553546607494354,
"rewards/format_reward": 1.0,
"step": 27
},
{
"all_correct": 0.03125,
"all_wrong": 0.09375,
"completion_length": 89.609375,
"epoch": 0.052930056710775046,
"grad_norm": 2.8058570294434158,
"kl": 0.0213623046875,
"learning_rate": 1.98620653017031e-06,
"loss": 0.0009,
"reward": 1.4355332851409912,
"reward_std": 0.3729744553565979,
"rewards/accuracy_reward": 0.4394395351409912,
"rewards/format_reward": 0.99609375,
"step": 28
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 89.26953125,
"epoch": 0.054820415879017016,
"grad_norm": 2.5095592118103562,
"kl": 0.0234375,
"learning_rate": 1.9852061683563294e-06,
"loss": 0.0009,
"reward": 1.4201900959014893,
"reward_std": 0.3762331008911133,
"rewards/accuracy_reward": 0.4280025064945221,
"rewards/format_reward": 0.9921875,
"step": 29
},
{
"all_correct": 0.0,
"all_wrong": 0.125,
"completion_length": 89.44921875,
"epoch": 0.05671077504725898,
"grad_norm": 2.251874852478182,
"kl": 0.02587890625,
"learning_rate": 1.9841710597877382e-06,
"loss": 0.001,
"reward": 1.482748031616211,
"reward_std": 0.37411996722221375,
"rewards/accuracy_reward": 0.4827480912208557,
"rewards/format_reward": 1.0,
"step": 30
},
{
"all_correct": 0.09375,
"all_wrong": 0.125,
"completion_length": 93.671875,
"epoch": 0.05860113421550094,
"grad_norm": 3.0094971920246465,
"kl": 0.0240478515625,
"learning_rate": 1.9831012409712736e-06,
"loss": 0.001,
"reward": 1.5294721126556396,
"reward_std": 0.28737539052963257,
"rewards/accuracy_reward": 0.5294721126556396,
"rewards/format_reward": 1.0,
"step": 31
},
{
"all_correct": 0.125,
"all_wrong": 0.0625,
"completion_length": 89.1875,
"epoch": 0.06049149338374291,
"grad_norm": 2.085921014940215,
"kl": 0.02294921875,
"learning_rate": 1.981996749637853e-06,
"loss": 0.0009,
"reward": 1.4983799457550049,
"reward_std": 0.3365086317062378,
"rewards/accuracy_reward": 0.5100986957550049,
"rewards/format_reward": 0.98828125,
"step": 32
},
{
"all_correct": 0.0625,
"all_wrong": 0.09375,
"completion_length": 90.39453125,
"epoch": 0.062381852551984876,
"grad_norm": 2.622758297904014,
"kl": 0.0267333984375,
"learning_rate": 1.9808576247412406e-06,
"loss": 0.0011,
"reward": 1.457702875137329,
"reward_std": 0.397009938955307,
"rewards/accuracy_reward": 0.4577029049396515,
"rewards/format_reward": 1.0,
"step": 33
},
{
"all_correct": 0.09375,
"all_wrong": 0.09375,
"completion_length": 89.71875,
"epoch": 0.06427221172022685,
"grad_norm": 2.171724077831634,
"kl": 0.0255126953125,
"learning_rate": 1.979683906456676e-06,
"loss": 0.001,
"reward": 1.5610275268554688,
"reward_std": 0.3770396411418915,
"rewards/accuracy_reward": 0.5688400268554688,
"rewards/format_reward": 0.9921875,
"step": 34
},
{
"all_correct": 0.03125,
"all_wrong": 0.125,
"completion_length": 102.4375,
"epoch": 0.0661625708884688,
"grad_norm": 2.4062787127917553,
"kl": 0.0238037109375,
"learning_rate": 1.9784756361794553e-06,
"loss": 0.001,
"reward": 1.4750714302062988,
"reward_std": 0.39490264654159546,
"rewards/accuracy_reward": 0.49069640040397644,
"rewards/format_reward": 0.984375,
"step": 35
},
{
"all_correct": 0.0625,
"all_wrong": 0.09375,
"completion_length": 100.83203125,
"epoch": 0.06805293005671077,
"grad_norm": 1.923377019543097,
"kl": 0.0269775390625,
"learning_rate": 1.9772328565234715e-06,
"loss": 0.0011,
"reward": 1.453148603439331,
"reward_std": 0.3995356857776642,
"rewards/accuracy_reward": 0.46877366304397583,
"rewards/format_reward": 0.984375,
"step": 36
},
{
"all_correct": 0.125,
"all_wrong": 0.0625,
"completion_length": 93.76953125,
"epoch": 0.06994328922495274,
"grad_norm": 2.5852022147016065,
"kl": 0.033935546875,
"learning_rate": 1.9759556113197133e-06,
"loss": 0.0014,
"reward": 1.5378497838974,
"reward_std": 0.3647596836090088,
"rewards/accuracy_reward": 0.5378497838973999,
"rewards/format_reward": 1.0,
"step": 37
},
{
"all_correct": 0.09375,
"all_wrong": 0.03125,
"completion_length": 93.65625,
"epoch": 0.07183364839319471,
"grad_norm": 3.0899596065894617,
"kl": 0.0311279296875,
"learning_rate": 1.974643945614717e-06,
"loss": 0.0012,
"reward": 1.5544549226760864,
"reward_std": 0.38801953196525574,
"rewards/accuracy_reward": 0.5661737322807312,
"rewards/format_reward": 0.98828125,
"step": 38
},
{
"all_correct": 0.09375,
"all_wrong": 0.15625,
"completion_length": 87.58203125,
"epoch": 0.07372400756143667,
"grad_norm": 2.210050723375974,
"kl": 0.031982421875,
"learning_rate": 1.973297905668979e-06,
"loss": 0.0013,
"reward": 1.4015624523162842,
"reward_std": 0.32554763555526733,
"rewards/accuracy_reward": 0.40546876192092896,
"rewards/format_reward": 0.99609375,
"step": 39
},
{
"all_correct": 0.0,
"all_wrong": 0.25,
"completion_length": 96.12109375,
"epoch": 0.07561436672967864,
"grad_norm": 3.5246311894519393,
"kl": 0.029052734375,
"learning_rate": 1.971917538955324e-06,
"loss": 0.0012,
"reward": 1.3508296012878418,
"reward_std": 0.28737950325012207,
"rewards/accuracy_reward": 0.350829541683197,
"rewards/format_reward": 1.0,
"step": 40
},
{
"all_correct": 0.1875,
"all_wrong": 0.0625,
"completion_length": 89.08984375,
"epoch": 0.07750472589792061,
"grad_norm": 2.4498338590518633,
"kl": 0.03466796875,
"learning_rate": 1.9705028941572306e-06,
"loss": 0.0014,
"reward": 1.5412009954452515,
"reward_std": 0.28274005651474,
"rewards/accuracy_reward": 0.5412009358406067,
"rewards/format_reward": 1.0,
"step": 41
},
{
"all_correct": 0.09375,
"all_wrong": 0.09375,
"completion_length": 92.94921875,
"epoch": 0.07939508506616257,
"grad_norm": 2.3904089625119402,
"kl": 0.03369140625,
"learning_rate": 1.9690540211671144e-06,
"loss": 0.0013,
"reward": 1.5241187810897827,
"reward_std": 0.3563092350959778,
"rewards/accuracy_reward": 0.5241187810897827,
"rewards/format_reward": 1.0,
"step": 42
},
{
"all_correct": 0.09375,
"all_wrong": 0.09375,
"completion_length": 94.828125,
"epoch": 0.08128544423440454,
"grad_norm": 1.9707044514802314,
"kl": 0.036865234375,
"learning_rate": 1.9675709710845685e-06,
"loss": 0.0015,
"reward": 1.506849765777588,
"reward_std": 0.37673529982566833,
"rewards/accuracy_reward": 0.5263809561729431,
"rewards/format_reward": 0.98046875,
"step": 43
},
{
"all_correct": 0.09375,
"all_wrong": 0.125,
"completion_length": 95.9140625,
"epoch": 0.0831758034026465,
"grad_norm": 2.2780949740065326,
"kl": 0.037109375,
"learning_rate": 1.966053796214561e-06,
"loss": 0.0015,
"reward": 1.53428316116333,
"reward_std": 0.30121245980262756,
"rewards/accuracy_reward": 0.5342831015586853,
"rewards/format_reward": 1.0,
"step": 44
},
{
"all_correct": 0.09375,
"all_wrong": 0.0,
"completion_length": 93.00390625,
"epoch": 0.08506616257088846,
"grad_norm": 2.5387336771715483,
"kl": 0.033935546875,
"learning_rate": 1.9645025500655906e-06,
"loss": 0.0014,
"reward": 1.6001973152160645,
"reward_std": 0.3812434673309326,
"rewards/accuracy_reward": 0.6001973152160645,
"rewards/format_reward": 1.0,
"step": 45
},
{
"all_correct": 0.15625,
"all_wrong": 0.125,
"completion_length": 95.98046875,
"epoch": 0.08695652173913043,
"grad_norm": 1.9377592683430596,
"kl": 0.0341796875,
"learning_rate": 1.9629172873477994e-06,
"loss": 0.0014,
"reward": 1.5135773420333862,
"reward_std": 0.31737208366394043,
"rewards/accuracy_reward": 0.5174835920333862,
"rewards/format_reward": 0.99609375,
"step": 46
},
{
"all_correct": 0.03125,
"all_wrong": 0.125,
"completion_length": 100.23828125,
"epoch": 0.0888468809073724,
"grad_norm": 2.869116193500619,
"kl": 0.036865234375,
"learning_rate": 1.9612980639710424e-06,
"loss": 0.0015,
"reward": 1.4429457187652588,
"reward_std": 0.3532693684101105,
"rewards/accuracy_reward": 0.4468519687652588,
"rewards/format_reward": 0.99609375,
"step": 47
},
{
"all_correct": 0.09375,
"all_wrong": 0.1875,
"completion_length": 92.6640625,
"epoch": 0.09073724007561437,
"grad_norm": 2.613636479936795,
"kl": 0.0361328125,
"learning_rate": 1.959644937042918e-06,
"loss": 0.0014,
"reward": 1.4310598373413086,
"reward_std": 0.3048115074634552,
"rewards/accuracy_reward": 0.43887221813201904,
"rewards/format_reward": 0.9921875,
"step": 48
},
{
"all_correct": 0.0625,
"all_wrong": 0.0625,
"completion_length": 94.671875,
"epoch": 0.09262759924385633,
"grad_norm": 2.3402731827939296,
"kl": 0.0361328125,
"learning_rate": 1.957957964866751e-06,
"loss": 0.0014,
"reward": 1.512986660003662,
"reward_std": 0.3828140199184418,
"rewards/accuracy_reward": 0.5168927907943726,
"rewards/format_reward": 0.99609375,
"step": 49
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 98.546875,
"epoch": 0.0945179584120983,
"grad_norm": 2.244807148397354,
"kl": 0.03857421875,
"learning_rate": 1.956237206939538e-06,
"loss": 0.0015,
"reward": 1.47536301612854,
"reward_std": 0.400837242603302,
"rewards/accuracy_reward": 0.47926920652389526,
"rewards/format_reward": 0.99609375,
"step": 50
},
{
"all_correct": 0.125,
"all_wrong": 0.09375,
"completion_length": 90.328125,
"epoch": 0.09640831758034027,
"grad_norm": 2.3971385643088348,
"kl": 0.04248046875,
"learning_rate": 1.9544827239498494e-06,
"loss": 0.0017,
"reward": 1.5585781335830688,
"reward_std": 0.37756532430648804,
"rewards/accuracy_reward": 0.5624843835830688,
"rewards/format_reward": 0.99609375,
"step": 51
},
{
"all_correct": 0.15625,
"all_wrong": 0.09375,
"completion_length": 84.60546875,
"epoch": 0.09829867674858223,
"grad_norm": 2.2709012051170987,
"kl": 0.038330078125,
"learning_rate": 1.952694577775688e-06,
"loss": 0.0015,
"reward": 1.4947199821472168,
"reward_std": 0.28436505794525146,
"rewards/accuracy_reward": 0.4947200417518616,
"rewards/format_reward": 1.0,
"step": 52
},
{
"all_correct": 0.1875,
"all_wrong": 0.15625,
"completion_length": 90.6953125,
"epoch": 0.1001890359168242,
"grad_norm": 2.3353621567696172,
"kl": 0.040283203125,
"learning_rate": 1.950872831482306e-06,
"loss": 0.0016,
"reward": 1.5374271869659424,
"reward_std": 0.2822425663471222,
"rewards/accuracy_reward": 0.5374271869659424,
"rewards/format_reward": 1.0,
"step": 53
},
{
"all_correct": 0.15625,
"all_wrong": 0.0625,
"completion_length": 85.8828125,
"epoch": 0.10207939508506617,
"grad_norm": 2.686467954946551,
"kl": 0.044189453125,
"learning_rate": 1.949017549319983e-06,
"loss": 0.0018,
"reward": 1.546825647354126,
"reward_std": 0.2987156808376312,
"rewards/accuracy_reward": 0.546825647354126,
"rewards/format_reward": 1.0,
"step": 54
},
{
"all_correct": 0.09375,
"all_wrong": 0.0625,
"completion_length": 98.734375,
"epoch": 0.10396975425330812,
"grad_norm": 1.9504297926987135,
"kl": 0.0390625,
"learning_rate": 1.947128796721759e-06,
"loss": 0.0016,
"reward": 1.6038429737091064,
"reward_std": 0.3754524886608124,
"rewards/accuracy_reward": 0.6077491641044617,
"rewards/format_reward": 0.99609375,
"step": 55
},
{
"all_correct": 0.125,
"all_wrong": 0.09375,
"completion_length": 93.5859375,
"epoch": 0.10586011342155009,
"grad_norm": 3.287003319255905,
"kl": 0.041748046875,
"learning_rate": 1.9452066403011253e-06,
"loss": 0.0017,
"reward": 1.4696145057678223,
"reward_std": 0.34197893738746643,
"rewards/accuracy_reward": 0.47352084517478943,
"rewards/format_reward": 0.99609375,
"step": 56
},
{
"all_correct": 0.21875,
"all_wrong": 0.09375,
"completion_length": 94.36328125,
"epoch": 0.10775047258979206,
"grad_norm": 3.2939637852247685,
"kl": 0.04345703125,
"learning_rate": 1.9432511478496766e-06,
"loss": 0.0017,
"reward": 1.5146770477294922,
"reward_std": 0.26444536447525024,
"rewards/accuracy_reward": 0.514677107334137,
"rewards/format_reward": 1.0,
"step": 57
},
{
"all_correct": 0.1875,
"all_wrong": 0.15625,
"completion_length": 92.765625,
"epoch": 0.10964083175803403,
"grad_norm": 2.1754630667582413,
"kl": 0.046630859375,
"learning_rate": 1.9412623883347206e-06,
"loss": 0.0019,
"reward": 1.5093607902526855,
"reward_std": 0.26719433069229126,
"rewards/accuracy_reward": 0.5171732902526855,
"rewards/format_reward": 0.9921875,
"step": 58
},
{
"all_correct": 0.125,
"all_wrong": 0.09375,
"completion_length": 89.7890625,
"epoch": 0.11153119092627599,
"grad_norm": 3.158327945972579,
"kl": 0.042236328125,
"learning_rate": 1.939240431896844e-06,
"loss": 0.0017,
"reward": 1.4645804166793823,
"reward_std": 0.333609402179718,
"rewards/accuracy_reward": 0.46848660707473755,
"rewards/format_reward": 0.99609375,
"step": 59
},
{
"all_correct": 0.125,
"all_wrong": 0.1875,
"completion_length": 92.24609375,
"epoch": 0.11342155009451796,
"grad_norm": 4.067057374654062,
"kl": 0.04736328125,
"learning_rate": 1.937185349847439e-06,
"loss": 0.0019,
"reward": 1.4154318571090698,
"reward_std": 0.28358522057533264,
"rewards/accuracy_reward": 0.41543182730674744,
"rewards/format_reward": 1.0,
"step": 60
},
{
"all_correct": 0.1875,
"all_wrong": 0.21875,
"completion_length": 85.125,
"epoch": 0.11531190926275993,
"grad_norm": 10.445979366082359,
"kl": 0.048095703125,
"learning_rate": 1.9350972146661903e-06,
"loss": 0.0019,
"reward": 1.5046515464782715,
"reward_std": 0.24698607623577118,
"rewards/accuracy_reward": 0.5085577964782715,
"rewards/format_reward": 0.99609375,
"step": 61
},
{
"all_correct": 0.15625,
"all_wrong": 0.1875,
"completion_length": 87.96484375,
"epoch": 0.11720226843100189,
"grad_norm": 4.779328158469133,
"kl": 0.04736328125,
"learning_rate": 1.9329760999985165e-06,
"loss": 0.0019,
"reward": 1.4373040199279785,
"reward_std": 0.29254239797592163,
"rewards/accuracy_reward": 0.4373040795326233,
"rewards/format_reward": 1.0,
"step": 62
},
{
"all_correct": 0.1875,
"all_wrong": 0.125,
"completion_length": 85.94140625,
"epoch": 0.11909262759924386,
"grad_norm": 2.355263239902568,
"kl": 0.052490234375,
"learning_rate": 1.9308220806529737e-06,
"loss": 0.0021,
"reward": 1.4870660305023193,
"reward_std": 0.3119330108165741,
"rewards/accuracy_reward": 0.49097222089767456,
"rewards/format_reward": 0.99609375,
"step": 63
},
{
"all_correct": 0.1875,
"all_wrong": 0.09375,
"completion_length": 82.74609375,
"epoch": 0.12098298676748583,
"grad_norm": 2.4735239418793604,
"kl": 0.050048828125,
"learning_rate": 1.9286352325986163e-06,
"loss": 0.002,
"reward": 1.5674299001693726,
"reward_std": 0.2720338702201843,
"rewards/accuracy_reward": 0.5791486501693726,
"rewards/format_reward": 0.98828125,
"step": 64
},
{
"all_correct": 0.15625,
"all_wrong": 0.09375,
"completion_length": 84.96875,
"epoch": 0.12287334593572778,
"grad_norm": 3.2309139052967457,
"kl": 0.05517578125,
"learning_rate": 1.9264156329623195e-06,
"loss": 0.0022,
"reward": 1.5520949363708496,
"reward_std": 0.2912652790546417,
"rewards/accuracy_reward": 0.5520949363708496,
"rewards/format_reward": 1.0,
"step": 65
},
{
"all_correct": 0.21875,
"all_wrong": 0.125,
"completion_length": 86.56640625,
"epoch": 0.12476370510396975,
"grad_norm": 3.426837800551729,
"kl": 0.05126953125,
"learning_rate": 1.9241633600260575e-06,
"loss": 0.0021,
"reward": 1.5658715963363647,
"reward_std": 0.2718903720378876,
"rewards/accuracy_reward": 0.5658715963363647,
"rewards/format_reward": 1.0,
"step": 66
},
{
"all_correct": 0.15625,
"all_wrong": 0.125,
"completion_length": 85.89453125,
"epoch": 0.1266540642722117,
"grad_norm": 5.040208361814542,
"kl": 0.046630859375,
"learning_rate": 1.921878493224143e-06,
"loss": 0.0019,
"reward": 1.4924273490905762,
"reward_std": 0.2940727174282074,
"rewards/accuracy_reward": 0.49633359909057617,
"rewards/format_reward": 0.99609375,
"step": 67
},
{
"all_correct": 0.3125,
"all_wrong": 0.125,
"completion_length": 78.671875,
"epoch": 0.1285444234404537,
"grad_norm": 2.2476945314714643,
"kl": 0.050048828125,
"learning_rate": 1.9195611131404267e-06,
"loss": 0.002,
"reward": 1.5983318090438843,
"reward_std": 0.21439965069293976,
"rewards/accuracy_reward": 0.5983318090438843,
"rewards/format_reward": 1.0,
"step": 68
},
{
"all_correct": 0.15625,
"all_wrong": 0.25,
"completion_length": 87.28515625,
"epoch": 0.13043478260869565,
"grad_norm": 1.8603814885894778,
"kl": 0.043701171875,
"learning_rate": 1.9172113015054528e-06,
"loss": 0.0017,
"reward": 1.3991045951843262,
"reward_std": 0.22454139590263367,
"rewards/accuracy_reward": 0.4030107259750366,
"rewards/format_reward": 0.99609375,
"step": 69
},
{
"all_correct": 0.1875,
"all_wrong": 0.09375,
"completion_length": 85.96484375,
"epoch": 0.1323251417769376,
"grad_norm": 3.4308816795426536,
"kl": 0.0537109375,
"learning_rate": 1.9148291411935796e-06,
"loss": 0.0022,
"reward": 1.5655429363250732,
"reward_std": 0.3076818287372589,
"rewards/accuracy_reward": 0.5655430555343628,
"rewards/format_reward": 1.0,
"step": 70
},
{
"all_correct": 0.15625,
"all_wrong": 0.09375,
"completion_length": 91.11328125,
"epoch": 0.1342155009451796,
"grad_norm": 1.959995440169171,
"kl": 0.0517578125,
"learning_rate": 1.9124147162200534e-06,
"loss": 0.0021,
"reward": 1.4995213747024536,
"reward_std": 0.3407973051071167,
"rewards/accuracy_reward": 0.5073338747024536,
"rewards/format_reward": 0.9921875,
"step": 71
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 79.31640625,
"epoch": 0.13610586011342155,
"grad_norm": 1.8897107714275212,
"kl": 0.053466796875,
"learning_rate": 1.9099681117380486e-06,
"loss": 0.0021,
"reward": 1.634920597076416,
"reward_std": 0.21926391124725342,
"rewards/accuracy_reward": 0.6349206566810608,
"rewards/format_reward": 1.0,
"step": 72
},
{
"all_correct": 0.4375,
"all_wrong": 0.09375,
"completion_length": 83.95703125,
"epoch": 0.13799621928166353,
"grad_norm": 1.6792021771564014,
"kl": 0.052490234375,
"learning_rate": 1.907489414035662e-06,
"loss": 0.0021,
"reward": 1.7418066263198853,
"reward_std": 0.20898818969726562,
"rewards/accuracy_reward": 0.7457128763198853,
"rewards/format_reward": 0.99609375,
"step": 73
},
{
"all_correct": 0.21875,
"all_wrong": 0.25,
"completion_length": 82.68359375,
"epoch": 0.13988657844990549,
"grad_norm": 18.661291363199766,
"kl": 0.0556640625,
"learning_rate": 1.9049787105328714e-06,
"loss": 0.0022,
"reward": 1.5272233486175537,
"reward_std": 0.22412577271461487,
"rewards/accuracy_reward": 0.5311296582221985,
"rewards/format_reward": 0.99609375,
"step": 74
},
{
"all_correct": 0.21875,
"all_wrong": 0.125,
"completion_length": 86.87890625,
"epoch": 0.14177693761814744,
"grad_norm": 1.968107409229428,
"kl": 0.05517578125,
"learning_rate": 1.9024360897784505e-06,
"loss": 0.0022,
"reward": 1.53652024269104,
"reward_std": 0.2906913161277771,
"rewards/accuracy_reward": 0.5443326234817505,
"rewards/format_reward": 0.9921875,
"step": 75
},
{
"all_correct": 0.25,
"all_wrong": 0.15625,
"completion_length": 89.5546875,
"epoch": 0.14366729678638943,
"grad_norm": 1.9463441104137431,
"kl": 0.05322265625,
"learning_rate": 1.8998616414468477e-06,
"loss": 0.0021,
"reward": 1.5365304946899414,
"reward_std": 0.22610870003700256,
"rewards/accuracy_reward": 0.5365304350852966,
"rewards/format_reward": 1.0,
"step": 76
},
{
"all_correct": 0.1875,
"all_wrong": 0.1875,
"completion_length": 89.98046875,
"epoch": 0.14555765595463138,
"grad_norm": 4.159641747444261,
"kl": 0.0517578125,
"learning_rate": 1.897255456335022e-06,
"loss": 0.0021,
"reward": 1.530354619026184,
"reward_std": 0.2696187496185303,
"rewards/accuracy_reward": 0.5303546190261841,
"rewards/format_reward": 1.0,
"step": 77
},
{
"all_correct": 0.28125,
"all_wrong": 0.125,
"completion_length": 86.078125,
"epoch": 0.14744801512287334,
"grad_norm": 1.8829618370755878,
"kl": 0.054443359375,
"learning_rate": 1.894617626359242e-06,
"loss": 0.0022,
"reward": 1.6262216567993164,
"reward_std": 0.2085573971271515,
"rewards/accuracy_reward": 0.6262217164039612,
"rewards/format_reward": 1.0,
"step": 78
},
{
"all_correct": 0.25,
"all_wrong": 0.125,
"completion_length": 92.19921875,
"epoch": 0.14933837429111532,
"grad_norm": 3.1281418285199996,
"kl": 0.047607421875,
"learning_rate": 1.8919482445518434e-06,
"loss": 0.0019,
"reward": 1.5566154718399048,
"reward_std": 0.2710142731666565,
"rewards/accuracy_reward": 0.5566154718399048,
"rewards/format_reward": 1.0,
"step": 79
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 97.9453125,
"epoch": 0.15122873345935728,
"grad_norm": 1.8453083650465387,
"kl": 0.0498046875,
"learning_rate": 1.8892474050579476e-06,
"loss": 0.002,
"reward": 1.526172399520874,
"reward_std": 0.15693798661231995,
"rewards/accuracy_reward": 0.5261724591255188,
"rewards/format_reward": 1.0,
"step": 80
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 96.15625,
"epoch": 0.15311909262759923,
"grad_norm": 1.618450951037405,
"kl": 0.052734375,
"learning_rate": 1.8865152031321425e-06,
"loss": 0.0021,
"reward": 1.5617804527282715,
"reward_std": 0.20894506573677063,
"rewards/accuracy_reward": 0.5656867027282715,
"rewards/format_reward": 0.99609375,
"step": 81
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 93.93359375,
"epoch": 0.15500945179584122,
"grad_norm": 3.3619693904490346,
"kl": 0.049560546875,
"learning_rate": 1.8837517351351212e-06,
"loss": 0.002,
"reward": 1.502871036529541,
"reward_std": 0.2329874485731125,
"rewards/accuracy_reward": 0.5028710961341858,
"rewards/format_reward": 1.0,
"step": 82
},
{
"all_correct": 0.25,
"all_wrong": 0.09375,
"completion_length": 100.8671875,
"epoch": 0.15689981096408318,
"grad_norm": 2.0482069336725717,
"kl": 0.05029296875,
"learning_rate": 1.8809570985302861e-06,
"loss": 0.002,
"reward": 1.5919384956359863,
"reward_std": 0.2818424105644226,
"rewards/accuracy_reward": 0.5919384956359863,
"rewards/format_reward": 1.0,
"step": 83
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 93.40234375,
"epoch": 0.15879017013232513,
"grad_norm": 1.5003724294387097,
"kl": 0.0498046875,
"learning_rate": 1.8781313918803083e-06,
"loss": 0.002,
"reward": 1.5504703521728516,
"reward_std": 0.22670012712478638,
"rewards/accuracy_reward": 0.5739079117774963,
"rewards/format_reward": 0.9765625,
"step": 84
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 87.7421875,
"epoch": 0.16068052930056712,
"grad_norm": 4.602359933501162,
"kl": 0.054443359375,
"learning_rate": 1.8752747148436542e-06,
"loss": 0.0022,
"reward": 1.5955908298492432,
"reward_std": 0.164137065410614,
"rewards/accuracy_reward": 0.5955909490585327,
"rewards/format_reward": 1.0,
"step": 85
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 92.83203125,
"epoch": 0.16257088846880907,
"grad_norm": 1.903155543992341,
"kl": 0.052490234375,
"learning_rate": 1.8723871681710694e-06,
"loss": 0.0021,
"reward": 1.4634450674057007,
"reward_std": 0.20022635161876678,
"rewards/accuracy_reward": 0.4634450674057007,
"rewards/format_reward": 1.0,
"step": 86
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 91.84765625,
"epoch": 0.16446124763705103,
"grad_norm": 1.6427736050864759,
"kl": 0.06640625,
"learning_rate": 1.8694688537020258e-06,
"loss": 0.0027,
"reward": 1.4603149890899658,
"reward_std": 0.1893424689769745,
"rewards/accuracy_reward": 0.468127578496933,
"rewards/format_reward": 0.9921875,
"step": 87
},
{
"all_correct": 0.3125,
"all_wrong": 0.28125,
"completion_length": 98.12890625,
"epoch": 0.166351606805293,
"grad_norm": 1.2849844281526226,
"kl": 0.053955078125,
"learning_rate": 1.866519874361129e-06,
"loss": 0.0022,
"reward": 1.493840217590332,
"reward_std": 0.1990819126367569,
"rewards/accuracy_reward": 0.5055589079856873,
"rewards/format_reward": 0.98828125,
"step": 88
},
{
"all_correct": 0.28125,
"all_wrong": 0.03125,
"completion_length": 97.5546875,
"epoch": 0.16824196597353497,
"grad_norm": 3.36212198927166,
"kl": 0.051513671875,
"learning_rate": 1.8635403341544897e-06,
"loss": 0.0021,
"reward": 1.6382396221160889,
"reward_std": 0.29065367579460144,
"rewards/accuracy_reward": 0.6421457529067993,
"rewards/format_reward": 0.99609375,
"step": 89
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 93.35546875,
"epoch": 0.17013232514177692,
"grad_norm": 1.964853567576083,
"kl": 0.052001953125,
"learning_rate": 1.8605303381660542e-06,
"loss": 0.0021,
"reward": 1.4816901683807373,
"reward_std": 0.2411368191242218,
"rewards/accuracy_reward": 0.48559650778770447,
"rewards/format_reward": 0.99609375,
"step": 90
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 97.84375,
"epoch": 0.1720226843100189,
"grad_norm": 1.7427774741536497,
"kl": 0.048583984375,
"learning_rate": 1.8574899925538995e-06,
"loss": 0.0019,
"reward": 1.594164490699768,
"reward_std": 0.22482213377952576,
"rewards/accuracy_reward": 0.6176020503044128,
"rewards/format_reward": 0.9765625,
"step": 91
},
{
"all_correct": 0.375,
"all_wrong": 0.25,
"completion_length": 96.33203125,
"epoch": 0.17391304347826086,
"grad_norm": 1.178350633076914,
"kl": 0.0478515625,
"learning_rate": 1.8544194045464886e-06,
"loss": 0.0019,
"reward": 1.570425271987915,
"reward_std": 0.15437397360801697,
"rewards/accuracy_reward": 0.5704251527786255,
"rewards/format_reward": 1.0,
"step": 92
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 96.08984375,
"epoch": 0.17580340264650285,
"grad_norm": 1.961774060919074,
"kl": 0.050048828125,
"learning_rate": 1.8513186824388878e-06,
"loss": 0.002,
"reward": 1.4320415258407593,
"reward_std": 0.18912720680236816,
"rewards/accuracy_reward": 0.44766655564308167,
"rewards/format_reward": 0.984375,
"step": 93
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 99.69921875,
"epoch": 0.1776937618147448,
"grad_norm": 1.5015570744898201,
"kl": 0.046630859375,
"learning_rate": 1.8481879355889493e-06,
"loss": 0.0019,
"reward": 1.5381855964660645,
"reward_std": 0.2091609686613083,
"rewards/accuracy_reward": 0.5459980964660645,
"rewards/format_reward": 0.9921875,
"step": 94
},
{
"all_correct": 0.25,
"all_wrong": 0.34375,
"completion_length": 93.59765625,
"epoch": 0.17958412098298676,
"grad_norm": 1.5238358848316345,
"kl": 0.0576171875,
"learning_rate": 1.8450272744134533e-06,
"loss": 0.0023,
"reward": 1.4812531471252441,
"reward_std": 0.15344488620758057,
"rewards/accuracy_reward": 0.48125314712524414,
"rewards/format_reward": 1.0,
"step": 95
},
{
"all_correct": 0.21875,
"all_wrong": 0.3125,
"completion_length": 101.0390625,
"epoch": 0.18147448015122875,
"grad_norm": 1.3836535628215803,
"kl": 0.053955078125,
"learning_rate": 1.8418368103842122e-06,
"loss": 0.0022,
"reward": 1.4727139472961426,
"reward_std": 0.20200154185295105,
"rewards/accuracy_reward": 0.4766201078891754,
"rewards/format_reward": 0.99609375,
"step": 96
},
{
"all_correct": 0.375,
"all_wrong": 0.125,
"completion_length": 100.37109375,
"epoch": 0.1833648393194707,
"grad_norm": 4.036599697785966,
"kl": 0.04541015625,
"learning_rate": 1.8386166560241431e-06,
"loss": 0.0018,
"reward": 1.5775189399719238,
"reward_std": 0.2264866977930069,
"rewards/accuracy_reward": 0.5775189399719238,
"rewards/format_reward": 1.0,
"step": 97
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 91.52734375,
"epoch": 0.18525519848771266,
"grad_norm": 2.004043176233875,
"kl": 0.048095703125,
"learning_rate": 1.835366924903295e-06,
"loss": 0.0019,
"reward": 1.612939476966858,
"reward_std": 0.20541028678417206,
"rewards/accuracy_reward": 0.6129394769668579,
"rewards/format_reward": 1.0,
"step": 98
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 86.5625,
"epoch": 0.18714555765595464,
"grad_norm": 1.4852739381285651,
"kl": 0.04736328125,
"learning_rate": 1.8320877316348453e-06,
"loss": 0.0019,
"reward": 1.6532118320465088,
"reward_std": 0.14846470952033997,
"rewards/accuracy_reward": 0.6532118320465088,
"rewards/format_reward": 1.0,
"step": 99
},
{
"all_correct": 0.15625,
"all_wrong": 0.125,
"completion_length": 99.05078125,
"epoch": 0.1890359168241966,
"grad_norm": 4.076356880937712,
"kl": 0.043701171875,
"learning_rate": 1.8287791918710584e-06,
"loss": 0.0017,
"reward": 1.478124976158142,
"reward_std": 0.32335546612739563,
"rewards/accuracy_reward": 0.4859375059604645,
"rewards/format_reward": 0.9921875,
"step": 100
},
{
"all_correct": 0.28125,
"all_wrong": 0.03125,
"completion_length": 91.46875,
"epoch": 0.19092627599243855,
"grad_norm": 3.6383673386285675,
"kl": 0.04296875,
"learning_rate": 1.8254414222992057e-06,
"loss": 0.0017,
"reward": 1.6602026224136353,
"reward_std": 0.30003082752227783,
"rewards/accuracy_reward": 0.6602025628089905,
"rewards/format_reward": 1.0,
"step": 101
},
{
"all_correct": 0.25,
"all_wrong": 0.0625,
"completion_length": 90.078125,
"epoch": 0.19281663516068054,
"grad_norm": 2.071351404946499,
"kl": 0.04833984375,
"learning_rate": 1.8220745406374495e-06,
"loss": 0.0019,
"reward": 1.6283621788024902,
"reward_std": 0.2751215100288391,
"rewards/accuracy_reward": 0.6283620595932007,
"rewards/format_reward": 1.0,
"step": 102
},
{
"all_correct": 0.21875,
"all_wrong": 0.1875,
"completion_length": 97.75390625,
"epoch": 0.1947069943289225,
"grad_norm": 1.97407172472788,
"kl": 0.044921875,
"learning_rate": 1.8186786656306934e-06,
"loss": 0.0018,
"reward": 1.5005707740783691,
"reward_std": 0.2729160189628601,
"rewards/accuracy_reward": 0.5083833336830139,
"rewards/format_reward": 0.9921875,
"step": 103
},
{
"all_correct": 0.21875,
"all_wrong": 0.125,
"completion_length": 90.484375,
"epoch": 0.19659735349716445,
"grad_norm": 2.1911504081514734,
"kl": 0.04345703125,
"learning_rate": 1.8152539170463922e-06,
"loss": 0.0017,
"reward": 1.5098209381103516,
"reward_std": 0.24280044436454773,
"rewards/accuracy_reward": 0.5137272477149963,
"rewards/format_reward": 0.99609375,
"step": 104
},
{
"all_correct": 0.375,
"all_wrong": 0.0625,
"completion_length": 91.6875,
"epoch": 0.19848771266540643,
"grad_norm": 2.1758066744800924,
"kl": 0.048583984375,
"learning_rate": 1.8118004156703295e-06,
"loss": 0.0019,
"reward": 1.666813611984253,
"reward_std": 0.2204323410987854,
"rewards/accuracy_reward": 0.6668134927749634,
"rewards/format_reward": 1.0,
"step": 105
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 85.43359375,
"epoch": 0.2003780718336484,
"grad_norm": 2.2920515987360774,
"kl": 0.048828125,
"learning_rate": 1.808318283302356e-06,
"loss": 0.002,
"reward": 1.6315945386886597,
"reward_std": 0.17060711979866028,
"rewards/accuracy_reward": 0.6315945386886597,
"rewards/format_reward": 1.0,
"step": 106
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 92.27734375,
"epoch": 0.20226843100189035,
"grad_norm": 1.6468108640255814,
"kl": 0.048583984375,
"learning_rate": 1.8048076427520956e-06,
"loss": 0.0019,
"reward": 1.6242039203643799,
"reward_std": 0.2274279147386551,
"rewards/accuracy_reward": 0.6281101703643799,
"rewards/format_reward": 0.99609375,
"step": 107
},
{
"all_correct": 0.1875,
"all_wrong": 0.34375,
"completion_length": 90.59765625,
"epoch": 0.20415879017013233,
"grad_norm": 1.66499120414633,
"kl": 0.052978515625,
"learning_rate": 1.801268617834614e-06,
"loss": 0.0021,
"reward": 1.4206892251968384,
"reward_std": 0.15304508805274963,
"rewards/accuracy_reward": 0.4206892251968384,
"rewards/format_reward": 1.0,
"step": 108
},
{
"all_correct": 0.4375,
"all_wrong": 0.125,
"completion_length": 92.9296875,
"epoch": 0.2060491493383743,
"grad_norm": 1.3046512489220061,
"kl": 0.044677734375,
"learning_rate": 1.7977013333660498e-06,
"loss": 0.0018,
"reward": 1.637601613998413,
"reward_std": 0.17598497867584229,
"rewards/accuracy_reward": 0.6571328639984131,
"rewards/format_reward": 0.98046875,
"step": 109
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 87.953125,
"epoch": 0.20793950850661624,
"grad_norm": 1.9099290554016326,
"kl": 0.04541015625,
"learning_rate": 1.7941059151592145e-06,
"loss": 0.0018,
"reward": 1.579087734222412,
"reward_std": 0.17340317368507385,
"rewards/accuracy_reward": 0.5829939842224121,
"rewards/format_reward": 0.99609375,
"step": 110
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 93.56640625,
"epoch": 0.20982986767485823,
"grad_norm": 6.034052106072661,
"kl": 0.046142578125,
"learning_rate": 1.7904824900191555e-06,
"loss": 0.0018,
"reward": 1.5380107164382935,
"reward_std": 0.1255079060792923,
"rewards/accuracy_reward": 0.5380107164382935,
"rewards/format_reward": 1.0,
"step": 111
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 92.42578125,
"epoch": 0.21172022684310018,
"grad_norm": 1.4881947802094542,
"kl": 0.050537109375,
"learning_rate": 1.786831185738682e-06,
"loss": 0.002,
"reward": 1.5942175388336182,
"reward_std": 0.12322144210338593,
"rewards/accuracy_reward": 0.5981237888336182,
"rewards/format_reward": 0.99609375,
"step": 112
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 89.22265625,
"epoch": 0.21361058601134217,
"grad_norm": 3.5770227394508876,
"kl": 0.04931640625,
"learning_rate": 1.7831521310938587e-06,
"loss": 0.002,
"reward": 1.5163066387176514,
"reward_std": 0.1974867880344391,
"rewards/accuracy_reward": 0.5163066387176514,
"rewards/format_reward": 1.0,
"step": 113
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 82.1328125,
"epoch": 0.21550094517958412,
"grad_norm": 2.212730582665482,
"kl": 0.0556640625,
"learning_rate": 1.7794454558394657e-06,
"loss": 0.0022,
"reward": 1.6943297386169434,
"reward_std": 0.18354207277297974,
"rewards/accuracy_reward": 0.6943297982215881,
"rewards/format_reward": 1.0,
"step": 114
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 90.8203125,
"epoch": 0.21739130434782608,
"grad_norm": 4.361936414639028,
"kl": 0.056396484375,
"learning_rate": 1.7757112907044198e-06,
"loss": 0.0023,
"reward": 1.555484652519226,
"reward_std": 0.1996951699256897,
"rewards/accuracy_reward": 0.5554846525192261,
"rewards/format_reward": 1.0,
"step": 115
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 87.26171875,
"epoch": 0.21928166351606806,
"grad_norm": 1.8180751664027779,
"kl": 0.0498046875,
"learning_rate": 1.7719497673871651e-06,
"loss": 0.002,
"reward": 1.4978692531585693,
"reward_std": 0.19965368509292603,
"rewards/accuracy_reward": 0.4978693127632141,
"rewards/format_reward": 1.0,
"step": 116
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 93.81640625,
"epoch": 0.22117202268431002,
"grad_norm": 1.8114429952435827,
"kl": 0.047119140625,
"learning_rate": 1.7681610185510283e-06,
"loss": 0.0019,
"reward": 1.6657145023345947,
"reward_std": 0.15193983912467957,
"rewards/accuracy_reward": 0.66962069272995,
"rewards/format_reward": 0.99609375,
"step": 117
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 88.76171875,
"epoch": 0.22306238185255198,
"grad_norm": 3.054644149610559,
"kl": 0.049072265625,
"learning_rate": 1.7643451778195394e-06,
"loss": 0.002,
"reward": 1.5918383598327637,
"reward_std": 0.20005470514297485,
"rewards/accuracy_reward": 0.5918383002281189,
"rewards/format_reward": 1.0,
"step": 118
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 86.08203125,
"epoch": 0.22495274102079396,
"grad_norm": 2.092558777656238,
"kl": 0.054443359375,
"learning_rate": 1.7605023797717194e-06,
"loss": 0.0022,
"reward": 1.6277587413787842,
"reward_std": 0.19753384590148926,
"rewards/accuracy_reward": 0.6277587413787842,
"rewards/format_reward": 1.0,
"step": 119
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 90.5859375,
"epoch": 0.22684310018903592,
"grad_norm": 4.221955954568503,
"kl": 0.051025390625,
"learning_rate": 1.7566327599373336e-06,
"loss": 0.002,
"reward": 1.6072583198547363,
"reward_std": 0.1987745761871338,
"rewards/accuracy_reward": 0.6072583794593811,
"rewards/format_reward": 1.0,
"step": 120
},
{
"all_correct": 0.375,
"all_wrong": 0.125,
"completion_length": 90.83984375,
"epoch": 0.22873345935727787,
"grad_norm": 1.8343630634483405,
"kl": 0.0498046875,
"learning_rate": 1.7527364547921118e-06,
"loss": 0.002,
"reward": 1.6175568103790283,
"reward_std": 0.17853425443172455,
"rewards/accuracy_reward": 0.6175566911697388,
"rewards/format_reward": 1.0,
"step": 121
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 84.7734375,
"epoch": 0.23062381852551986,
"grad_norm": 1.3989906719734049,
"kl": 0.0615234375,
"learning_rate": 1.748813601752935e-06,
"loss": 0.0025,
"reward": 1.7062667608261108,
"reward_std": 0.1319604218006134,
"rewards/accuracy_reward": 0.7062667012214661,
"rewards/format_reward": 1.0,
"step": 122
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 91.40234375,
"epoch": 0.23251417769376181,
"grad_norm": 1.9445947479760342,
"kl": 0.052978515625,
"learning_rate": 1.7448643391729886e-06,
"loss": 0.0021,
"reward": 1.573242425918579,
"reward_std": 0.2199619710445404,
"rewards/accuracy_reward": 0.5849611759185791,
"rewards/format_reward": 0.98828125,
"step": 123
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 92.703125,
"epoch": 0.23440453686200377,
"grad_norm": 1.9821554924076896,
"kl": 0.047607421875,
"learning_rate": 1.7408888063368838e-06,
"loss": 0.0019,
"reward": 1.517979621887207,
"reward_std": 0.1772761046886444,
"rewards/accuracy_reward": 0.517979621887207,
"rewards/format_reward": 1.0,
"step": 124
},
{
"all_correct": 0.25,
"all_wrong": 0.1875,
"completion_length": 94.2421875,
"epoch": 0.23629489603024575,
"grad_norm": 1.7860361568391379,
"kl": 0.056640625,
"learning_rate": 1.7368871434557445e-06,
"loss": 0.0023,
"reward": 1.5142911672592163,
"reward_std": 0.24644066393375397,
"rewards/accuracy_reward": 0.5221036672592163,
"rewards/format_reward": 0.9921875,
"step": 125
},
{
"all_correct": 0.1875,
"all_wrong": 0.0,
"completion_length": 100.96484375,
"epoch": 0.2381852551984877,
"grad_norm": 2.329904142862184,
"kl": 0.046875,
"learning_rate": 1.7328594916622615e-06,
"loss": 0.0019,
"reward": 1.5455485582351685,
"reward_std": 0.35480332374572754,
"rewards/accuracy_reward": 0.5650798082351685,
"rewards/format_reward": 0.98046875,
"step": 126
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 85.55859375,
"epoch": 0.24007561436672967,
"grad_norm": 1.3180884643239135,
"kl": 0.045654296875,
"learning_rate": 1.7288059930057165e-06,
"loss": 0.0018,
"reward": 1.626103401184082,
"reward_std": 0.19042542576789856,
"rewards/accuracy_reward": 0.633915901184082,
"rewards/format_reward": 0.9921875,
"step": 127
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 96.4765625,
"epoch": 0.24196597353497165,
"grad_norm": 1.631849621742655,
"kl": 0.05322265625,
"learning_rate": 1.7247267904469723e-06,
"loss": 0.0021,
"reward": 1.5507967472076416,
"reward_std": 0.21631184220314026,
"rewards/accuracy_reward": 0.5586091876029968,
"rewards/format_reward": 0.9921875,
"step": 128
},
{
"all_correct": 0.375,
"all_wrong": 0.25,
"completion_length": 90.7265625,
"epoch": 0.2438563327032136,
"grad_norm": 2.019747189377822,
"kl": 0.054931640625,
"learning_rate": 1.7206220278534285e-06,
"loss": 0.0022,
"reward": 1.5207030773162842,
"reward_std": 0.14799641072750092,
"rewards/accuracy_reward": 0.528515636920929,
"rewards/format_reward": 0.9921875,
"step": 129
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 92.640625,
"epoch": 0.24574669187145556,
"grad_norm": 1.3384988128455364,
"kl": 0.054443359375,
"learning_rate": 1.7164918499939501e-06,
"loss": 0.0022,
"reward": 1.621284008026123,
"reward_std": 0.16135218739509583,
"rewards/accuracy_reward": 0.6330028176307678,
"rewards/format_reward": 0.98828125,
"step": 130
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 93.1015625,
"epoch": 0.24763705103969755,
"grad_norm": 1.687141544181196,
"kl": 0.0576171875,
"learning_rate": 1.712336402533761e-06,
"loss": 0.0023,
"reward": 1.5703125,
"reward_std": 0.19187898933887482,
"rewards/accuracy_reward": 0.5703125,
"rewards/format_reward": 1.0,
"step": 131
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 92.66796875,
"epoch": 0.2495274102079395,
"grad_norm": 79.59867493675014,
"kl": 0.048828125,
"learning_rate": 1.7081558320293053e-06,
"loss": 0.002,
"reward": 1.590255618095398,
"reward_std": 0.1713361293077469,
"rewards/accuracy_reward": 0.5941617488861084,
"rewards/format_reward": 0.99609375,
"step": 132
},
{
"all_correct": 0.5,
"all_wrong": 0.09375,
"completion_length": 87.046875,
"epoch": 0.2514177693761815,
"grad_norm": 1.4501119642103175,
"kl": 0.04833984375,
"learning_rate": 1.7039502859230797e-06,
"loss": 0.0019,
"reward": 1.6924138069152832,
"reward_std": 0.1737845093011856,
"rewards/accuracy_reward": 0.6924139261245728,
"rewards/format_reward": 1.0,
"step": 133
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 83.3828125,
"epoch": 0.2533081285444234,
"grad_norm": 1.753007888815303,
"kl": 0.060791015625,
"learning_rate": 1.699719912538434e-06,
"loss": 0.0024,
"reward": 1.6514040231704712,
"reward_std": 0.15027320384979248,
"rewards/accuracy_reward": 0.6514040231704712,
"rewards/format_reward": 1.0,
"step": 134
},
{
"all_correct": 0.1875,
"all_wrong": 0.1875,
"completion_length": 84.77734375,
"epoch": 0.2551984877126654,
"grad_norm": 2.162337902604871,
"kl": 0.05712890625,
"learning_rate": 1.6954648610743384e-06,
"loss": 0.0023,
"reward": 1.534517765045166,
"reward_std": 0.2556450366973877,
"rewards/accuracy_reward": 0.534517765045166,
"rewards/format_reward": 1.0,
"step": 135
},
{
"all_correct": 0.21875,
"all_wrong": 0.25,
"completion_length": 91.46484375,
"epoch": 0.2570888468809074,
"grad_norm": 2.2567486633448244,
"kl": 0.059326171875,
"learning_rate": 1.6911852816001217e-06,
"loss": 0.0024,
"reward": 1.4765625,
"reward_std": 0.22168521583080292,
"rewards/accuracy_reward": 0.4765624701976776,
"rewards/format_reward": 1.0,
"step": 136
},
{
"all_correct": 0.28125,
"all_wrong": 0.15625,
"completion_length": 95.22265625,
"epoch": 0.2589792060491493,
"grad_norm": 1.685961718455796,
"kl": 0.053466796875,
"learning_rate": 1.6868813250501808e-06,
"loss": 0.0021,
"reward": 1.5958139896392822,
"reward_std": 0.23510046303272247,
"rewards/accuracy_reward": 0.595814049243927,
"rewards/format_reward": 1.0,
"step": 137
},
{
"all_correct": 0.5,
"all_wrong": 0.125,
"completion_length": 92.0390625,
"epoch": 0.2608695652173913,
"grad_norm": 3.0694185267878074,
"kl": 0.0517578125,
"learning_rate": 1.682553143218654e-06,
"loss": 0.0021,
"reward": 1.6615285873413086,
"reward_std": 0.18214064836502075,
"rewards/accuracy_reward": 0.6810599565505981,
"rewards/format_reward": 0.98046875,
"step": 138
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 95.0703125,
"epoch": 0.2627599243856333,
"grad_norm": 1.8498537052874253,
"kl": 0.050537109375,
"learning_rate": 1.6782008887540702e-06,
"loss": 0.002,
"reward": 1.5477758646011353,
"reward_std": 0.20087680220603943,
"rewards/accuracy_reward": 0.55558842420578,
"rewards/format_reward": 0.9921875,
"step": 139
},
{
"all_correct": 0.25,
"all_wrong": 0.3125,
"completion_length": 90.0,
"epoch": 0.2646502835538752,
"grad_norm": 1.4885477074641043,
"kl": 0.0556640625,
"learning_rate": 1.6738247151539643e-06,
"loss": 0.0022,
"reward": 1.4492642879486084,
"reward_std": 0.17115281522274017,
"rewards/accuracy_reward": 0.4570767879486084,
"rewards/format_reward": 0.9921875,
"step": 140
},
{
"all_correct": 0.1875,
"all_wrong": 0.1875,
"completion_length": 93.39453125,
"epoch": 0.2665406427221172,
"grad_norm": 1.8807841599551895,
"kl": 0.047119140625,
"learning_rate": 1.6694247767594622e-06,
"loss": 0.0019,
"reward": 1.4642714262008667,
"reward_std": 0.260834664106369,
"rewards/accuracy_reward": 0.4838026762008667,
"rewards/format_reward": 0.98046875,
"step": 141
},
{
"all_correct": 0.46875,
"all_wrong": 0.28125,
"completion_length": 84.546875,
"epoch": 0.2684310018903592,
"grad_norm": 1.820094916728817,
"kl": 0.059814453125,
"learning_rate": 1.665001228749841e-06,
"loss": 0.0024,
"reward": 1.566421627998352,
"reward_std": 0.1025347113609314,
"rewards/accuracy_reward": 0.566421627998352,
"rewards/format_reward": 1.0,
"step": 142
},
{
"all_correct": 0.34375,
"all_wrong": 0.28125,
"completion_length": 91.03125,
"epoch": 0.27032136105860116,
"grad_norm": 5.252949042087521,
"kl": 0.05859375,
"learning_rate": 1.6605542271370511e-06,
"loss": 0.0023,
"reward": 1.5315755605697632,
"reward_std": 0.18068033456802368,
"rewards/accuracy_reward": 0.5315755605697632,
"rewards/format_reward": 1.0,
"step": 143
},
{
"all_correct": 0.4375,
"all_wrong": 0.25,
"completion_length": 100.015625,
"epoch": 0.2722117202268431,
"grad_norm": 1.417138335386092,
"kl": 0.050537109375,
"learning_rate": 1.6560839287602191e-06,
"loss": 0.002,
"reward": 1.5791447162628174,
"reward_std": 0.14070533215999603,
"rewards/accuracy_reward": 0.5986760258674622,
"rewards/format_reward": 0.98046875,
"step": 144
},
{
"all_correct": 0.40625,
"all_wrong": 0.3125,
"completion_length": 94.4375,
"epoch": 0.2741020793950851,
"grad_norm": 1.8097027337596667,
"kl": 0.050537109375,
"learning_rate": 1.6515904912801118e-06,
"loss": 0.002,
"reward": 1.4946039915084839,
"reward_std": 0.09259741008281708,
"rewards/accuracy_reward": 0.4946039319038391,
"rewards/format_reward": 1.0,
"step": 145
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 90.09375,
"epoch": 0.27599243856332706,
"grad_norm": 1.7093281467382162,
"kl": 0.056884765625,
"learning_rate": 1.6470740731735786e-06,
"loss": 0.0023,
"reward": 1.5983502864837646,
"reward_std": 0.1455744206905365,
"rewards/accuracy_reward": 0.6022564768791199,
"rewards/format_reward": 0.99609375,
"step": 146
},
{
"all_correct": 0.40625,
"all_wrong": 0.375,
"completion_length": 95.01953125,
"epoch": 0.277882797731569,
"grad_norm": 1.4598558142559408,
"kl": 0.060546875,
"learning_rate": 1.6425348337279617e-06,
"loss": 0.0024,
"reward": 1.524511694908142,
"reward_std": 0.08749698102474213,
"rewards/accuracy_reward": 0.5284179449081421,
"rewards/format_reward": 0.99609375,
"step": 147
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 97.53515625,
"epoch": 0.27977315689981097,
"grad_norm": 1.6254939755919178,
"kl": 0.0478515625,
"learning_rate": 1.6379729330354773e-06,
"loss": 0.0019,
"reward": 1.5108827352523804,
"reward_std": 0.1570434868335724,
"rewards/accuracy_reward": 0.5108827352523804,
"rewards/format_reward": 1.0,
"step": 148
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 102.171875,
"epoch": 0.28166351606805295,
"grad_norm": 6.200286331983825,
"kl": 0.05322265625,
"learning_rate": 1.63338853198757e-06,
"loss": 0.0021,
"reward": 1.59765625,
"reward_std": 0.15199562907218933,
"rewards/accuracy_reward": 0.59765625,
"rewards/format_reward": 1.0,
"step": 149
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 93.01953125,
"epoch": 0.2835538752362949,
"grad_norm": 1.3238758411422094,
"kl": 0.06005859375,
"learning_rate": 1.6287817922692394e-06,
"loss": 0.0024,
"reward": 1.5933270454406738,
"reward_std": 0.1732364296913147,
"rewards/accuracy_reward": 0.593326985836029,
"rewards/format_reward": 1.0,
"step": 150
},
{
"all_correct": 0.375,
"all_wrong": 0.28125,
"completion_length": 100.703125,
"epoch": 0.28544423440453687,
"grad_norm": 1.4671353082453924,
"kl": 0.05615234375,
"learning_rate": 1.6241528763533351e-06,
"loss": 0.0022,
"reward": 1.521083950996399,
"reward_std": 0.14775413274765015,
"rewards/accuracy_reward": 0.5249902009963989,
"rewards/format_reward": 0.99609375,
"step": 151
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 104.98046875,
"epoch": 0.28733459357277885,
"grad_norm": 1.7191498537725731,
"kl": 0.0517578125,
"learning_rate": 1.6195019474948298e-06,
"loss": 0.0021,
"reward": 1.5247777700424194,
"reward_std": 0.1547752171754837,
"rewards/accuracy_reward": 0.5247777700424194,
"rewards/format_reward": 1.0,
"step": 152
},
{
"all_correct": 0.34375,
"all_wrong": 0.34375,
"completion_length": 95.25,
"epoch": 0.2892249527410208,
"grad_norm": 1.1025414542451693,
"kl": 0.05224609375,
"learning_rate": 1.6148291697250592e-06,
"loss": 0.0021,
"reward": 1.4935517311096191,
"reward_std": 0.06223775073885918,
"rewards/accuracy_reward": 0.49355170130729675,
"rewards/format_reward": 1.0,
"step": 153
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 97.43359375,
"epoch": 0.29111531190926276,
"grad_norm": 3.2260317883554293,
"kl": 0.05224609375,
"learning_rate": 1.6101347078459374e-06,
"loss": 0.0021,
"reward": 1.5610017776489258,
"reward_std": 0.1826038360595703,
"rewards/accuracy_reward": 0.5610017776489258,
"rewards/format_reward": 1.0,
"step": 154
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 98.3828125,
"epoch": 0.29300567107750475,
"grad_norm": 3.2855377413213036,
"kl": 0.054443359375,
"learning_rate": 1.6054187274241447e-06,
"loss": 0.0022,
"reward": 1.6555452346801758,
"reward_std": 0.21821025013923645,
"rewards/accuracy_reward": 0.6828888654708862,
"rewards/format_reward": 0.97265625,
"step": 155
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 99.22265625,
"epoch": 0.2948960302457467,
"grad_norm": 3.1851976233257986,
"kl": 0.049072265625,
"learning_rate": 1.6006813947852892e-06,
"loss": 0.002,
"reward": 1.4952456951141357,
"reward_std": 0.2039714902639389,
"rewards/accuracy_reward": 0.49524572491645813,
"rewards/format_reward": 1.0,
"step": 156
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 92.09375,
"epoch": 0.29678638941398866,
"grad_norm": 2.1557933244114107,
"kl": 0.055908203125,
"learning_rate": 1.5959228770080389e-06,
"loss": 0.0022,
"reward": 1.54817795753479,
"reward_std": 0.19130109250545502,
"rewards/accuracy_reward": 0.54817795753479,
"rewards/format_reward": 1.0,
"step": 157
},
{
"all_correct": 0.65625,
"all_wrong": 0.0625,
"completion_length": 87.30078125,
"epoch": 0.29867674858223064,
"grad_norm": 1.5896533313014187,
"kl": 0.052490234375,
"learning_rate": 1.5911433419182304e-06,
"loss": 0.0021,
"reward": 1.7363414764404297,
"reward_std": 0.09228336811065674,
"rewards/accuracy_reward": 0.7363415956497192,
"rewards/format_reward": 1.0,
"step": 158
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 98.4140625,
"epoch": 0.3005671077504726,
"grad_norm": 8.013680822640337,
"kl": 0.051513671875,
"learning_rate": 1.5863429580829499e-06,
"loss": 0.0021,
"reward": 1.567818522453308,
"reward_std": 0.161808043718338,
"rewards/accuracy_reward": 0.5709435343742371,
"rewards/format_reward": 0.99609375,
"step": 159
},
{
"all_correct": 0.3125,
"all_wrong": 0.125,
"completion_length": 95.6484375,
"epoch": 0.30245746691871456,
"grad_norm": 1.8629346782025675,
"kl": 0.0595703125,
"learning_rate": 1.5815218948045877e-06,
"loss": 0.0024,
"reward": 1.6700589656829834,
"reward_std": 0.22937864065170288,
"rewards/accuracy_reward": 0.6778714656829834,
"rewards/format_reward": 0.9921875,
"step": 160
},
{
"all_correct": 0.28125,
"all_wrong": 0.34375,
"completion_length": 90.875,
"epoch": 0.30434782608695654,
"grad_norm": 1.2131759411170726,
"kl": 0.0556640625,
"learning_rate": 1.5766803221148673e-06,
"loss": 0.0022,
"reward": 1.476467490196228,
"reward_std": 0.1405719369649887,
"rewards/accuracy_reward": 0.4764673709869385,
"rewards/format_reward": 1.0,
"step": 161
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 98.62109375,
"epoch": 0.30623818525519847,
"grad_norm": 1.575835196195424,
"kl": 0.049072265625,
"learning_rate": 1.571818410768848e-06,
"loss": 0.002,
"reward": 1.5944631099700928,
"reward_std": 0.18872258067131042,
"rewards/accuracy_reward": 0.594463050365448,
"rewards/format_reward": 1.0,
"step": 162
},
{
"all_correct": 0.28125,
"all_wrong": 0.28125,
"completion_length": 98.5078125,
"epoch": 0.30812854442344045,
"grad_norm": 1.3803997172387867,
"kl": 0.04736328125,
"learning_rate": 1.566936332238904e-06,
"loss": 0.0019,
"reward": 1.5406312942504883,
"reward_std": 0.14286097884178162,
"rewards/accuracy_reward": 0.5484437942504883,
"rewards/format_reward": 0.9921875,
"step": 163
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 96.4296875,
"epoch": 0.31001890359168244,
"grad_norm": 4.219394179057573,
"kl": 0.05615234375,
"learning_rate": 1.5620342587086756e-06,
"loss": 0.0022,
"reward": 1.6263850927352905,
"reward_std": 0.19135481119155884,
"rewards/accuracy_reward": 0.6263850927352905,
"rewards/format_reward": 1.0,
"step": 164
},
{
"all_correct": 0.34375,
"all_wrong": 0.28125,
"completion_length": 95.87890625,
"epoch": 0.31190926275992437,
"grad_norm": 1.4782346468519787,
"kl": 0.048095703125,
"learning_rate": 1.5571123630669977e-06,
"loss": 0.0019,
"reward": 1.5589654445648193,
"reward_std": 0.15776313841342926,
"rewards/accuracy_reward": 0.5628716349601746,
"rewards/format_reward": 0.99609375,
"step": 165
},
{
"all_correct": 0.21875,
"all_wrong": 0.21875,
"completion_length": 99.24609375,
"epoch": 0.31379962192816635,
"grad_norm": 1.7150440549446804,
"kl": 0.04736328125,
"learning_rate": 1.5521708189018004e-06,
"loss": 0.0019,
"reward": 1.4944391250610352,
"reward_std": 0.2474714070558548,
"rewards/accuracy_reward": 0.4944390654563904,
"rewards/format_reward": 1.0,
"step": 166
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 92.48828125,
"epoch": 0.31568998109640833,
"grad_norm": 2.0979330226121116,
"kl": 0.05224609375,
"learning_rate": 1.5472098004939887e-06,
"loss": 0.0021,
"reward": 1.606818675994873,
"reward_std": 0.19165176153182983,
"rewards/accuracy_reward": 0.610724925994873,
"rewards/format_reward": 0.99609375,
"step": 167
},
{
"all_correct": 0.375,
"all_wrong": 0.4375,
"completion_length": 98.86328125,
"epoch": 0.31758034026465026,
"grad_norm": 1.741385766018968,
"kl": 0.0458984375,
"learning_rate": 1.5422294828112952e-06,
"loss": 0.0018,
"reward": 1.4513907432556152,
"reward_std": 0.07461512833833694,
"rewards/accuracy_reward": 0.45529699325561523,
"rewards/format_reward": 0.99609375,
"step": 168
},
{
"all_correct": 0.15625,
"all_wrong": 0.28125,
"completion_length": 104.0859375,
"epoch": 0.31947069943289225,
"grad_norm": 2.346864808547952,
"kl": 0.041748046875,
"learning_rate": 1.537230041502109e-06,
"loss": 0.0017,
"reward": 1.4551925659179688,
"reward_std": 0.19008949398994446,
"rewards/accuracy_reward": 0.46691131591796875,
"rewards/format_reward": 0.98828125,
"step": 169
},
{
"all_correct": 0.5,
"all_wrong": 0.15625,
"completion_length": 90.89453125,
"epoch": 0.32136105860113423,
"grad_norm": 1.6647816583702195,
"kl": 0.05078125,
"learning_rate": 1.5322116528892807e-06,
"loss": 0.002,
"reward": 1.731924295425415,
"reward_std": 0.12868990004062653,
"rewards/accuracy_reward": 0.7319241762161255,
"rewards/format_reward": 1.0,
"step": 170
},
{
"all_correct": 0.28125,
"all_wrong": 0.28125,
"completion_length": 97.6015625,
"epoch": 0.32325141776937616,
"grad_norm": 1.4603160187510866,
"kl": 0.04833984375,
"learning_rate": 1.527174493963905e-06,
"loss": 0.0019,
"reward": 1.4863568544387817,
"reward_std": 0.18577060103416443,
"rewards/accuracy_reward": 0.49416929483413696,
"rewards/format_reward": 0.9921875,
"step": 171
},
{
"all_correct": 0.46875,
"all_wrong": 0.21875,
"completion_length": 95.61328125,
"epoch": 0.32514177693761814,
"grad_norm": 1.2779207893845528,
"kl": 0.05419921875,
"learning_rate": 1.5221187423790758e-06,
"loss": 0.0022,
"reward": 1.6187939643859863,
"reward_std": 0.1073196530342102,
"rewards/accuracy_reward": 0.6227001547813416,
"rewards/format_reward": 0.99609375,
"step": 172
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 95.03125,
"epoch": 0.3270321361058601,
"grad_norm": 1.594280468798857,
"kl": 0.05712890625,
"learning_rate": 1.517044576443625e-06,
"loss": 0.0023,
"reward": 1.5663572549819946,
"reward_std": 0.14794546365737915,
"rewards/accuracy_reward": 0.5663573741912842,
"rewards/format_reward": 1.0,
"step": 173
},
{
"all_correct": 0.5,
"all_wrong": 0.125,
"completion_length": 96.19921875,
"epoch": 0.32892249527410206,
"grad_norm": 35.90029700014664,
"kl": 0.056396484375,
"learning_rate": 1.5119521751158296e-06,
"loss": 0.0023,
"reward": 1.7366479635238647,
"reward_std": 0.1478818655014038,
"rewards/accuracy_reward": 0.7366479635238647,
"rewards/format_reward": 1.0,
"step": 174
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 94.14453125,
"epoch": 0.33081285444234404,
"grad_norm": 1.1879295692139324,
"kl": 0.05029296875,
"learning_rate": 1.5068417179971013e-06,
"loss": 0.002,
"reward": 1.6065101623535156,
"reward_std": 0.1347871571779251,
"rewards/accuracy_reward": 0.6104164123535156,
"rewards/format_reward": 0.99609375,
"step": 175
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 92.0078125,
"epoch": 0.332703213610586,
"grad_norm": 1.7027171491945203,
"kl": 0.056640625,
"learning_rate": 1.5017133853256536e-06,
"loss": 0.0023,
"reward": 1.6415752172470093,
"reward_std": 0.15971241891384125,
"rewards/accuracy_reward": 0.6415751576423645,
"rewards/format_reward": 1.0,
"step": 176
},
{
"all_correct": 0.4375,
"all_wrong": 0.25,
"completion_length": 93.62109375,
"epoch": 0.33459357277882795,
"grad_norm": 1.3008308802468183,
"kl": 0.05517578125,
"learning_rate": 1.4965673579701444e-06,
"loss": 0.0022,
"reward": 1.5429213047027588,
"reward_std": 0.11785108596086502,
"rewards/accuracy_reward": 0.5546400547027588,
"rewards/format_reward": 0.98828125,
"step": 177
},
{
"all_correct": 0.21875,
"all_wrong": 0.34375,
"completion_length": 101.2578125,
"epoch": 0.33648393194706994,
"grad_norm": 2.0499201856789537,
"kl": 0.04638671875,
"learning_rate": 1.4914038174232954e-06,
"loss": 0.0019,
"reward": 1.4305205345153809,
"reward_std": 0.18109694123268127,
"rewards/accuracy_reward": 0.4383331537246704,
"rewards/format_reward": 0.9921875,
"step": 178
},
{
"all_correct": 0.46875,
"all_wrong": 0.15625,
"completion_length": 94.9765625,
"epoch": 0.3383742911153119,
"grad_norm": 1.3560089780453208,
"kl": 0.051025390625,
"learning_rate": 1.4862229457954937e-06,
"loss": 0.002,
"reward": 1.6644376516342163,
"reward_std": 0.12696640193462372,
"rewards/accuracy_reward": 0.6644376516342163,
"rewards/format_reward": 1.0,
"step": 179
},
{
"all_correct": 0.4375,
"all_wrong": 0.21875,
"completion_length": 95.62109375,
"epoch": 0.34026465028355385,
"grad_norm": 1.312847563615755,
"kl": 0.052490234375,
"learning_rate": 1.4810249258083676e-06,
"loss": 0.0021,
"reward": 1.621772050857544,
"reward_std": 0.15236923098564148,
"rewards/accuracy_reward": 0.621772050857544,
"rewards/format_reward": 1.0,
"step": 180
},
{
"all_correct": 0.40625,
"all_wrong": 0.3125,
"completion_length": 97.97265625,
"epoch": 0.34215500945179583,
"grad_norm": 1.5335653397548066,
"kl": 0.047119140625,
"learning_rate": 1.475809940788342e-06,
"loss": 0.0019,
"reward": 1.5216861963272095,
"reward_std": 0.09969654679298401,
"rewards/accuracy_reward": 0.5255923867225647,
"rewards/format_reward": 0.99609375,
"step": 181
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 90.13671875,
"epoch": 0.3440453686200378,
"grad_norm": 1.6455259403603757,
"kl": 0.050537109375,
"learning_rate": 1.4705781746601738e-06,
"loss": 0.002,
"reward": 1.569726586341858,
"reward_std": 0.20621807873249054,
"rewards/accuracy_reward": 0.5736328363418579,
"rewards/format_reward": 0.99609375,
"step": 182
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 95.23828125,
"epoch": 0.34593572778827975,
"grad_norm": 1.7087145386845783,
"kl": 0.048095703125,
"learning_rate": 1.4653298119404645e-06,
"loss": 0.0019,
"reward": 1.566096544265747,
"reward_std": 0.16752052307128906,
"rewards/accuracy_reward": 0.5660964846611023,
"rewards/format_reward": 1.0,
"step": 183
},
{
"all_correct": 0.4375,
"all_wrong": 0.28125,
"completion_length": 86.921875,
"epoch": 0.34782608695652173,
"grad_norm": 1.4554606229566311,
"kl": 0.054443359375,
"learning_rate": 1.460065037731152e-06,
"loss": 0.0022,
"reward": 1.592024326324463,
"reward_std": 0.10995283722877502,
"rewards/accuracy_reward": 0.5920243859291077,
"rewards/format_reward": 1.0,
"step": 184
},
{
"all_correct": 0.5,
"all_wrong": 0.21875,
"completion_length": 94.37890625,
"epoch": 0.3497164461247637,
"grad_norm": 1.030689978981618,
"kl": 0.044921875,
"learning_rate": 1.454784037712984e-06,
"loss": 0.0018,
"reward": 1.6271023750305176,
"reward_std": 0.09506059437990189,
"rewards/accuracy_reward": 0.6271023750305176,
"rewards/format_reward": 1.0,
"step": 185
},
{
"all_correct": 0.5625,
"all_wrong": 0.1875,
"completion_length": 92.4765625,
"epoch": 0.3516068052930057,
"grad_norm": 2.3312930785993657,
"kl": 0.05224609375,
"learning_rate": 1.449486998138968e-06,
"loss": 0.0021,
"reward": 1.6988677978515625,
"reward_std": 0.08607158064842224,
"rewards/accuracy_reward": 0.698867678642273,
"rewards/format_reward": 1.0,
"step": 186
},
{
"all_correct": 0.3125,
"all_wrong": 0.3125,
"completion_length": 94.50390625,
"epoch": 0.3534971644612476,
"grad_norm": 1.370501060551843,
"kl": 0.050048828125,
"learning_rate": 1.4441741058278024e-06,
"loss": 0.002,
"reward": 1.5129822492599487,
"reward_std": 0.1419016569852829,
"rewards/accuracy_reward": 0.5129822492599487,
"rewards/format_reward": 1.0,
"step": 187
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 89.53515625,
"epoch": 0.3553875236294896,
"grad_norm": 1.4724253630775335,
"kl": 0.048828125,
"learning_rate": 1.4388455481572878e-06,
"loss": 0.002,
"reward": 1.6355903148651123,
"reward_std": 0.14424622058868408,
"rewards/accuracy_reward": 0.6394965648651123,
"rewards/format_reward": 0.99609375,
"step": 188
},
{
"all_correct": 0.46875,
"all_wrong": 0.21875,
"completion_length": 99.15234375,
"epoch": 0.3572778827977316,
"grad_norm": 1.1589530938405725,
"kl": 0.046142578125,
"learning_rate": 1.4335015130577198e-06,
"loss": 0.0018,
"reward": 1.6473121643066406,
"reward_std": 0.10182252526283264,
"rewards/accuracy_reward": 0.6473122239112854,
"rewards/format_reward": 1.0,
"step": 189
},
{
"all_correct": 0.5,
"all_wrong": 0.125,
"completion_length": 86.296875,
"epoch": 0.3591682419659735,
"grad_norm": 1.5269460633562093,
"kl": 0.04541015625,
"learning_rate": 1.428142189005259e-06,
"loss": 0.0018,
"reward": 1.7168800830841064,
"reward_std": 0.12831273674964905,
"rewards/accuracy_reward": 0.7168800234794617,
"rewards/format_reward": 1.0,
"step": 190
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 90.84765625,
"epoch": 0.3610586011342155,
"grad_norm": 1.5710398769636844,
"kl": 0.051513671875,
"learning_rate": 1.4227677650152847e-06,
"loss": 0.0021,
"reward": 1.6753089427947998,
"reward_std": 0.15371738374233246,
"rewards/accuracy_reward": 0.6753089427947998,
"rewards/format_reward": 1.0,
"step": 191
},
{
"all_correct": 0.5625,
"all_wrong": 0.15625,
"completion_length": 97.41015625,
"epoch": 0.3629489603024575,
"grad_norm": 1.3229536281928582,
"kl": 0.0478515625,
"learning_rate": 1.417378430635729e-06,
"loss": 0.0019,
"reward": 1.6942713260650635,
"reward_std": 0.08952474594116211,
"rewards/accuracy_reward": 0.6942713260650635,
"rewards/format_reward": 1.0,
"step": 192
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 85.03125,
"epoch": 0.3648393194706994,
"grad_norm": 2.579144764904904,
"kl": 0.0537109375,
"learning_rate": 1.4119743759403907e-06,
"loss": 0.0021,
"reward": 1.609615683555603,
"reward_std": 0.1612345427274704,
"rewards/accuracy_reward": 0.609615683555603,
"rewards/format_reward": 1.0,
"step": 193
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 98.421875,
"epoch": 0.3667296786389414,
"grad_norm": 4.601074879313099,
"kl": 0.055908203125,
"learning_rate": 1.406555791522232e-06,
"loss": 0.0022,
"reward": 1.6692792177200317,
"reward_std": 0.1721545159816742,
"rewards/accuracy_reward": 0.6731854677200317,
"rewards/format_reward": 0.99609375,
"step": 194
},
{
"all_correct": 0.46875,
"all_wrong": 0.25,
"completion_length": 86.79296875,
"epoch": 0.3686200378071834,
"grad_norm": 1.505355960378617,
"kl": 0.048095703125,
"learning_rate": 1.401122868486658e-06,
"loss": 0.0019,
"reward": 1.6007030010223389,
"reward_std": 0.1112457737326622,
"rewards/accuracy_reward": 0.6007030606269836,
"rewards/format_reward": 1.0,
"step": 195
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 92.05078125,
"epoch": 0.3705103969754253,
"grad_norm": 4.21038845661736,
"kl": 0.048095703125,
"learning_rate": 1.3956757984447744e-06,
"loss": 0.0019,
"reward": 1.5570985078811646,
"reward_std": 0.15561020374298096,
"rewards/accuracy_reward": 0.5570985078811646,
"rewards/format_reward": 1.0,
"step": 196
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 108.05859375,
"epoch": 0.3724007561436673,
"grad_norm": 1.8421976998281828,
"kl": 0.0458984375,
"learning_rate": 1.3902147735066305e-06,
"loss": 0.0018,
"reward": 1.6113016605377197,
"reward_std": 0.153774231672287,
"rewards/accuracy_reward": 0.615207850933075,
"rewards/format_reward": 0.99609375,
"step": 197
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 100.32421875,
"epoch": 0.3742911153119093,
"grad_norm": 1.4377049848914711,
"kl": 0.048583984375,
"learning_rate": 1.3847399862744449e-06,
"loss": 0.0019,
"reward": 1.5971312522888184,
"reward_std": 0.1938011348247528,
"rewards/accuracy_reward": 0.6010375022888184,
"rewards/format_reward": 0.99609375,
"step": 198
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 94.0,
"epoch": 0.3761814744801512,
"grad_norm": 2.662364277169509,
"kl": 0.053466796875,
"learning_rate": 1.3792516298358115e-06,
"loss": 0.0021,
"reward": 1.6497100591659546,
"reward_std": 0.12323208153247833,
"rewards/accuracy_reward": 0.6497100591659546,
"rewards/format_reward": 1.0,
"step": 199
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 93.59765625,
"epoch": 0.3780718336483932,
"grad_norm": 1.7035529891437522,
"kl": 0.049560546875,
"learning_rate": 1.37374989775689e-06,
"loss": 0.002,
"reward": 1.563701868057251,
"reward_std": 0.1927732229232788,
"rewards/accuracy_reward": 0.563701868057251,
"rewards/format_reward": 1.0,
"step": 200
},
{
"all_correct": 0.4375,
"all_wrong": 0.21875,
"completion_length": 85.83203125,
"epoch": 0.3799621928166352,
"grad_norm": 1.2571615120784285,
"kl": 0.0537109375,
"learning_rate": 1.3682349840755786e-06,
"loss": 0.0021,
"reward": 1.6499078273773193,
"reward_std": 0.12151362746953964,
"rewards/accuracy_reward": 0.6538141369819641,
"rewards/format_reward": 0.99609375,
"step": 201
},
{
"all_correct": 0.1875,
"all_wrong": 0.1875,
"completion_length": 92.55078125,
"epoch": 0.3818525519848771,
"grad_norm": 1.8828233098315525,
"kl": 0.052978515625,
"learning_rate": 1.3627070832946716e-06,
"loss": 0.0021,
"reward": 1.5092294216156006,
"reward_std": 0.23872140049934387,
"rewards/accuracy_reward": 0.5092294216156006,
"rewards/format_reward": 1.0,
"step": 202
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 90.94140625,
"epoch": 0.3837429111531191,
"grad_norm": 1.4414792921156796,
"kl": 0.04833984375,
"learning_rate": 1.3571663903749984e-06,
"loss": 0.0019,
"reward": 1.5634148120880127,
"reward_std": 0.18181678652763367,
"rewards/accuracy_reward": 0.5673210024833679,
"rewards/format_reward": 0.99609375,
"step": 203
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 93.390625,
"epoch": 0.3856332703213611,
"grad_norm": 1.76318060561253,
"kl": 0.05615234375,
"learning_rate": 1.351613100728548e-06,
"loss": 0.0022,
"reward": 1.5766992568969727,
"reward_std": 0.18105342984199524,
"rewards/accuracy_reward": 0.5766991972923279,
"rewards/format_reward": 1.0,
"step": 204
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 91.85546875,
"epoch": 0.387523629489603,
"grad_norm": 1.498906587190416,
"kl": 0.046142578125,
"learning_rate": 1.3460474102115784e-06,
"loss": 0.0018,
"reward": 1.5816829204559326,
"reward_std": 0.2008872926235199,
"rewards/accuracy_reward": 0.5816829204559326,
"rewards/format_reward": 1.0,
"step": 205
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 91.69921875,
"epoch": 0.389413988657845,
"grad_norm": 2.5901803764065447,
"kl": 0.052001953125,
"learning_rate": 1.340469515117706e-06,
"loss": 0.0021,
"reward": 1.5882611274719238,
"reward_std": 0.1939556896686554,
"rewards/accuracy_reward": 0.5882611274719238,
"rewards/format_reward": 1.0,
"step": 206
},
{
"all_correct": 0.34375,
"all_wrong": 0.125,
"completion_length": 85.96875,
"epoch": 0.391304347826087,
"grad_norm": 1.972889284821711,
"kl": 0.05615234375,
"learning_rate": 1.334879612170986e-06,
"loss": 0.0022,
"reward": 1.667292833328247,
"reward_std": 0.2238282561302185,
"rewards/accuracy_reward": 0.6672928929328918,
"rewards/format_reward": 1.0,
"step": 207
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 93.765625,
"epoch": 0.3931947069943289,
"grad_norm": 1.7336225636379132,
"kl": 0.0556640625,
"learning_rate": 1.3292778985189722e-06,
"loss": 0.0022,
"reward": 1.4546148777008057,
"reward_std": 0.202288419008255,
"rewards/accuracy_reward": 0.4546148180961609,
"rewards/format_reward": 1.0,
"step": 208
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 93.42578125,
"epoch": 0.3950850661625709,
"grad_norm": 1.5326638144326488,
"kl": 0.04296875,
"learning_rate": 1.323664571725764e-06,
"loss": 0.0017,
"reward": 1.581210970878601,
"reward_std": 0.13675948977470398,
"rewards/accuracy_reward": 0.5812109708786011,
"rewards/format_reward": 1.0,
"step": 209
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 90.45703125,
"epoch": 0.39697542533081287,
"grad_norm": 1.6717753066186387,
"kl": 0.055419921875,
"learning_rate": 1.3180398297650392e-06,
"loss": 0.0022,
"reward": 1.5307865142822266,
"reward_std": 0.21199406683444977,
"rewards/accuracy_reward": 0.534692645072937,
"rewards/format_reward": 0.99609375,
"step": 210
},
{
"all_correct": 0.25,
"all_wrong": 0.15625,
"completion_length": 88.4375,
"epoch": 0.3988657844990548,
"grad_norm": 3.0589163307910163,
"kl": 0.045654296875,
"learning_rate": 1.3124038710130721e-06,
"loss": 0.0018,
"reward": 1.4704865217208862,
"reward_std": 0.22353151440620422,
"rewards/accuracy_reward": 0.470486581325531,
"rewards/format_reward": 1.0,
"step": 211
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 91.19140625,
"epoch": 0.4007561436672968,
"grad_norm": 1.7075754827161425,
"kl": 0.052734375,
"learning_rate": 1.3067568942417354e-06,
"loss": 0.0021,
"reward": 1.568968415260315,
"reward_std": 0.19819122552871704,
"rewards/accuracy_reward": 0.5689684152603149,
"rewards/format_reward": 1.0,
"step": 212
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 92.08984375,
"epoch": 0.40264650283553877,
"grad_norm": 2.1354813874349254,
"kl": 0.05029296875,
"learning_rate": 1.3010990986114924e-06,
"loss": 0.002,
"reward": 1.533203125,
"reward_std": 0.2084732949733734,
"rewards/accuracy_reward": 0.552734375,
"rewards/format_reward": 0.98046875,
"step": 213
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 94.58203125,
"epoch": 0.4045368620037807,
"grad_norm": 2.165953349203724,
"kl": 0.04345703125,
"learning_rate": 1.29543068366437e-06,
"loss": 0.0017,
"reward": 1.511252760887146,
"reward_std": 0.2207871973514557,
"rewards/accuracy_reward": 0.511252760887146,
"rewards/format_reward": 1.0,
"step": 214
},
{
"all_correct": 0.34375,
"all_wrong": 0.28125,
"completion_length": 91.39453125,
"epoch": 0.4064272211720227,
"grad_norm": 1.8035046342920265,
"kl": 0.05322265625,
"learning_rate": 1.2897518493169238e-06,
"loss": 0.0021,
"reward": 1.5383806228637695,
"reward_std": 0.11602434515953064,
"rewards/accuracy_reward": 0.54228675365448,
"rewards/format_reward": 0.99609375,
"step": 215
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 85.265625,
"epoch": 0.40831758034026466,
"grad_norm": 15.766212379911176,
"kl": 0.045654296875,
"learning_rate": 1.284062795853185e-06,
"loss": 0.0018,
"reward": 1.633461356163025,
"reward_std": 0.1580500602722168,
"rewards/accuracy_reward": 0.6334613561630249,
"rewards/format_reward": 1.0,
"step": 216
},
{
"all_correct": 0.4375,
"all_wrong": 0.09375,
"completion_length": 91.390625,
"epoch": 0.4102079395085066,
"grad_norm": 1.5655799499341232,
"kl": 0.05078125,
"learning_rate": 1.2783637239175992e-06,
"loss": 0.002,
"reward": 1.670259952545166,
"reward_std": 0.18198764324188232,
"rewards/accuracy_reward": 0.670259952545166,
"rewards/format_reward": 1.0,
"step": 217
},
{
"all_correct": 0.4375,
"all_wrong": 0.125,
"completion_length": 86.6171875,
"epoch": 0.4120982986767486,
"grad_norm": 3.0507186922583482,
"kl": 0.0517578125,
"learning_rate": 1.2726548345079474e-06,
"loss": 0.0021,
"reward": 1.6517325639724731,
"reward_std": 0.17904990911483765,
"rewards/accuracy_reward": 0.6517325639724731,
"rewards/format_reward": 1.0,
"step": 218
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 92.4140625,
"epoch": 0.41398865784499056,
"grad_norm": 2.412347509180631,
"kl": 0.049560546875,
"learning_rate": 1.2669363289682581e-06,
"loss": 0.002,
"reward": 1.55078125,
"reward_std": 0.20410458743572235,
"rewards/accuracy_reward": 0.57421875,
"rewards/format_reward": 0.9765625,
"step": 219
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 100.796875,
"epoch": 0.4158790170132325,
"grad_norm": 1.1653385467510216,
"kl": 0.05078125,
"learning_rate": 1.261208408981708e-06,
"loss": 0.002,
"reward": 1.5812370777130127,
"reward_std": 0.16820110380649567,
"rewards/accuracy_reward": 0.5968619585037231,
"rewards/format_reward": 0.984375,
"step": 220
},
{
"all_correct": 0.5,
"all_wrong": 0.1875,
"completion_length": 89.671875,
"epoch": 0.41776937618147447,
"grad_norm": 1.4929073817173293,
"kl": 0.048828125,
"learning_rate": 1.2554712765635057e-06,
"loss": 0.0019,
"reward": 1.6370710134506226,
"reward_std": 0.11428119242191315,
"rewards/accuracy_reward": 0.6370710134506226,
"rewards/format_reward": 1.0,
"step": 221
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 85.70703125,
"epoch": 0.41965973534971646,
"grad_norm": 2.2938159154637265,
"kl": 0.052490234375,
"learning_rate": 1.2497251340537688e-06,
"loss": 0.0021,
"reward": 1.5362218618392944,
"reward_std": 0.2033883035182953,
"rewards/accuracy_reward": 0.5440343022346497,
"rewards/format_reward": 0.9921875,
"step": 222
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 88.765625,
"epoch": 0.4215500945179584,
"grad_norm": 1.590914762810909,
"kl": 0.045654296875,
"learning_rate": 1.2439701841103886e-06,
"loss": 0.0018,
"reward": 1.672126293182373,
"reward_std": 0.18559589982032776,
"rewards/accuracy_reward": 0.6721263527870178,
"rewards/format_reward": 1.0,
"step": 223
},
{
"all_correct": 0.21875,
"all_wrong": 0.21875,
"completion_length": 88.734375,
"epoch": 0.42344045368620037,
"grad_norm": 4.4273662063319925,
"kl": 0.051025390625,
"learning_rate": 1.2382066297018804e-06,
"loss": 0.002,
"reward": 1.5649325847625732,
"reward_std": 0.2352992594242096,
"rewards/accuracy_reward": 0.5649325847625732,
"rewards/format_reward": 1.0,
"step": 224
},
{
"all_correct": 0.5,
"all_wrong": 0.1875,
"completion_length": 88.78515625,
"epoch": 0.42533081285444235,
"grad_norm": 2.4145377912675245,
"kl": 0.048828125,
"learning_rate": 1.2324346741002259e-06,
"loss": 0.002,
"reward": 1.6205267906188965,
"reward_std": 0.1337902843952179,
"rewards/accuracy_reward": 0.6205266714096069,
"rewards/format_reward": 1.0,
"step": 225
},
{
"all_correct": 0.21875,
"all_wrong": 0.28125,
"completion_length": 87.2734375,
"epoch": 0.42722117202268434,
"grad_norm": 1.3878030528846113,
"kl": 0.049072265625,
"learning_rate": 1.2266545208737054e-06,
"loss": 0.002,
"reward": 1.4790351390838623,
"reward_std": 0.1779412180185318,
"rewards/accuracy_reward": 0.47903522849082947,
"rewards/format_reward": 1.0,
"step": 226
},
{
"all_correct": 0.375,
"all_wrong": 0.3125,
"completion_length": 85.23046875,
"epoch": 0.42911153119092627,
"grad_norm": 2.600137524410405,
"kl": 0.05126953125,
"learning_rate": 1.2208663738797165e-06,
"loss": 0.0021,
"reward": 1.5255839824676514,
"reward_std": 0.10616068542003632,
"rewards/accuracy_reward": 0.5255839824676514,
"rewards/format_reward": 1.0,
"step": 227
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 89.4375,
"epoch": 0.43100189035916825,
"grad_norm": 2.29638301880618,
"kl": 0.0517578125,
"learning_rate": 1.2150704372575853e-06,
"loss": 0.0021,
"reward": 1.525526762008667,
"reward_std": 0.1583402007818222,
"rewards/accuracy_reward": 0.5255266427993774,
"rewards/format_reward": 1.0,
"step": 228
},
{
"all_correct": 0.53125,
"all_wrong": 0.21875,
"completion_length": 84.92578125,
"epoch": 0.43289224952741023,
"grad_norm": 1.144130055159072,
"kl": 0.05322265625,
"learning_rate": 1.2092669154213664e-06,
"loss": 0.0021,
"reward": 1.5835583209991455,
"reward_std": 0.09602068364620209,
"rewards/accuracy_reward": 0.5874645113945007,
"rewards/format_reward": 0.99609375,
"step": 229
},
{
"all_correct": 0.28125,
"all_wrong": 0.15625,
"completion_length": 102.9765625,
"epoch": 0.43478260869565216,
"grad_norm": 2.039569056317988,
"kl": 0.048583984375,
"learning_rate": 1.203456013052634e-06,
"loss": 0.0019,
"reward": 1.5296072959899902,
"reward_std": 0.2428017556667328,
"rewards/accuracy_reward": 0.529607355594635,
"rewards/format_reward": 1.0,
"step": 230
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 100.609375,
"epoch": 0.43667296786389415,
"grad_norm": 1.8374962293393675,
"kl": 0.046630859375,
"learning_rate": 1.1976379350932618e-06,
"loss": 0.0019,
"reward": 1.6126770973205566,
"reward_std": 0.17599979043006897,
"rewards/accuracy_reward": 0.6126769781112671,
"rewards/format_reward": 1.0,
"step": 231
},
{
"all_correct": 0.46875,
"all_wrong": 0.15625,
"completion_length": 86.33203125,
"epoch": 0.43856332703213613,
"grad_norm": 1.3726940756027504,
"kl": 0.04638671875,
"learning_rate": 1.1918128867381965e-06,
"loss": 0.0019,
"reward": 1.6991832256317139,
"reward_std": 0.14738750457763672,
"rewards/accuracy_reward": 0.7030894160270691,
"rewards/format_reward": 0.99609375,
"step": 232
},
{
"all_correct": 0.375,
"all_wrong": 0.3125,
"completion_length": 94.38671875,
"epoch": 0.44045368620037806,
"grad_norm": 2.07132527283432,
"kl": 0.044189453125,
"learning_rate": 1.1859810734282207e-06,
"loss": 0.0018,
"reward": 1.5090982913970947,
"reward_std": 0.12135301530361176,
"rewards/accuracy_reward": 0.5325357913970947,
"rewards/format_reward": 0.9765625,
"step": 233
},
{
"all_correct": 0.375,
"all_wrong": 0.34375,
"completion_length": 89.13671875,
"epoch": 0.44234404536862004,
"grad_norm": 1.261505450158093,
"kl": 0.049072265625,
"learning_rate": 1.1801427008427063e-06,
"loss": 0.002,
"reward": 1.5006786584854126,
"reward_std": 0.11618545651435852,
"rewards/accuracy_reward": 0.5006786584854126,
"rewards/format_reward": 1.0,
"step": 234
},
{
"all_correct": 0.5625,
"all_wrong": 0.15625,
"completion_length": 88.6015625,
"epoch": 0.444234404536862,
"grad_norm": 1.6264673282441953,
"kl": 0.048583984375,
"learning_rate": 1.1742979748923608e-06,
"loss": 0.0019,
"reward": 1.7040953636169434,
"reward_std": 0.08967425674200058,
"rewards/accuracy_reward": 0.7040954232215881,
"rewards/format_reward": 1.0,
"step": 235
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 87.04296875,
"epoch": 0.44612476370510395,
"grad_norm": 1.8797855226580642,
"kl": 0.055908203125,
"learning_rate": 1.1684471017119665e-06,
"loss": 0.0022,
"reward": 1.5616300106048584,
"reward_std": 0.10765929520130157,
"rewards/accuracy_reward": 0.565536379814148,
"rewards/format_reward": 0.99609375,
"step": 236
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 95.9375,
"epoch": 0.44801512287334594,
"grad_norm": 1.3012401350602132,
"kl": 0.04541015625,
"learning_rate": 1.1625902876531083e-06,
"loss": 0.0018,
"reward": 1.4816043376922607,
"reward_std": 0.16820675134658813,
"rewards/accuracy_reward": 0.5011356472969055,
"rewards/format_reward": 0.98046875,
"step": 237
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 92.08203125,
"epoch": 0.4499054820415879,
"grad_norm": 2.6028023149547193,
"kl": 0.046630859375,
"learning_rate": 1.156727739276897e-06,
"loss": 0.0019,
"reward": 1.7202575206756592,
"reward_std": 0.14813324809074402,
"rewards/accuracy_reward": 0.7241637706756592,
"rewards/format_reward": 0.99609375,
"step": 238
},
{
"all_correct": 0.4375,
"all_wrong": 0.28125,
"completion_length": 99.9296875,
"epoch": 0.45179584120982985,
"grad_norm": 2.287302725917376,
"kl": 0.048583984375,
"learning_rate": 1.1508596633466853e-06,
"loss": 0.0019,
"reward": 1.5219180583953857,
"reward_std": 0.0942273810505867,
"rewards/accuracy_reward": 0.5492618083953857,
"rewards/format_reward": 0.97265625,
"step": 239
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 101.42578125,
"epoch": 0.45368620037807184,
"grad_norm": 3.1114222325682803,
"kl": 0.04541015625,
"learning_rate": 1.1449862668207732e-06,
"loss": 0.0018,
"reward": 1.4496355056762695,
"reward_std": 0.19880539178848267,
"rewards/accuracy_reward": 0.46916675567626953,
"rewards/format_reward": 0.98046875,
"step": 240
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 97.19140625,
"epoch": 0.4555765595463138,
"grad_norm": 1.8268906039007968,
"kl": 0.042724609375,
"learning_rate": 1.1391077568451115e-06,
"loss": 0.0017,
"reward": 1.6206369400024414,
"reward_std": 0.1697978675365448,
"rewards/accuracy_reward": 0.6284493803977966,
"rewards/format_reward": 0.9921875,
"step": 241
},
{
"all_correct": 0.5625,
"all_wrong": 0.125,
"completion_length": 83.81640625,
"epoch": 0.45746691871455575,
"grad_norm": 3.936269420804981,
"kl": 0.0439453125,
"learning_rate": 1.1332243407459938e-06,
"loss": 0.0018,
"reward": 1.7336182594299316,
"reward_std": 0.11902132630348206,
"rewards/accuracy_reward": 0.7336182594299316,
"rewards/format_reward": 1.0,
"step": 242
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 94.34765625,
"epoch": 0.45935727788279773,
"grad_norm": 2.638770141408573,
"kl": 0.048095703125,
"learning_rate": 1.1273362260227457e-06,
"loss": 0.0019,
"reward": 1.6561558246612549,
"reward_std": 0.16947349905967712,
"rewards/accuracy_reward": 0.6600620746612549,
"rewards/format_reward": 0.99609375,
"step": 243
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 90.83984375,
"epoch": 0.4612476370510397,
"grad_norm": 1.8631283133654752,
"kl": 0.050048828125,
"learning_rate": 1.121443620340406e-06,
"loss": 0.002,
"reward": 1.65234375,
"reward_std": 0.19478288292884827,
"rewards/accuracy_reward": 0.68359375,
"rewards/format_reward": 0.96875,
"step": 244
},
{
"all_correct": 0.21875,
"all_wrong": 0.28125,
"completion_length": 97.4296875,
"epoch": 0.46313799621928164,
"grad_norm": 1.614309504872873,
"kl": 0.044189453125,
"learning_rate": 1.1155467315224037e-06,
"loss": 0.0018,
"reward": 1.4352020025253296,
"reward_std": 0.1766517162322998,
"rewards/accuracy_reward": 0.435202032327652,
"rewards/format_reward": 1.0,
"step": 245
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 91.640625,
"epoch": 0.46502835538752363,
"grad_norm": 1.6379757636235799,
"kl": 0.0478515625,
"learning_rate": 1.1096457675432264e-06,
"loss": 0.0019,
"reward": 1.5429686307907104,
"reward_std": 0.14759376645088196,
"rewards/accuracy_reward": 0.5429686307907104,
"rewards/format_reward": 1.0,
"step": 246
},
{
"all_correct": 0.34375,
"all_wrong": 0.0625,
"completion_length": 91.984375,
"epoch": 0.4669187145557656,
"grad_norm": 2.5291223401978296,
"kl": 0.044921875,
"learning_rate": 1.1037409365210879e-06,
"loss": 0.0018,
"reward": 1.6242541074752808,
"reward_std": 0.2227800339460373,
"rewards/accuracy_reward": 0.6281603574752808,
"rewards/format_reward": 0.99609375,
"step": 247
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 95.96875,
"epoch": 0.46880907372400754,
"grad_norm": 1.5228770177382556,
"kl": 0.046630859375,
"learning_rate": 1.0978324467105857e-06,
"loss": 0.0019,
"reward": 1.575097680091858,
"reward_std": 0.1790701001882553,
"rewards/accuracy_reward": 0.5790039300918579,
"rewards/format_reward": 0.99609375,
"step": 248
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 91.23828125,
"epoch": 0.4706994328922495,
"grad_norm": 2.489169729265948,
"kl": 0.053466796875,
"learning_rate": 1.0919205064953581e-06,
"loss": 0.0021,
"reward": 1.5097450017929077,
"reward_std": 0.20132069289684296,
"rewards/accuracy_reward": 0.5097450017929077,
"rewards/format_reward": 1.0,
"step": 249
},
{
"all_correct": 0.34375,
"all_wrong": 0.28125,
"completion_length": 85.8046875,
"epoch": 0.4725897920604915,
"grad_norm": 2.586228799524351,
"kl": 0.0498046875,
"learning_rate": 1.0860053243807336e-06,
"loss": 0.002,
"reward": 1.514784574508667,
"reward_std": 0.14834506809711456,
"rewards/accuracy_reward": 0.514784574508667,
"rewards/format_reward": 1.0,
"step": 250
},
{
"all_correct": 0.28125,
"all_wrong": 0.15625,
"completion_length": 95.62109375,
"epoch": 0.47448015122873344,
"grad_norm": 1.713585099464109,
"kl": 0.042236328125,
"learning_rate": 1.0800871089863784e-06,
"loss": 0.0017,
"reward": 1.586524248123169,
"reward_std": 0.20916607975959778,
"rewards/accuracy_reward": 0.586524248123169,
"rewards/format_reward": 1.0,
"step": 251
},
{
"all_correct": 0.4375,
"all_wrong": 0.21875,
"completion_length": 84.66015625,
"epoch": 0.4763705103969754,
"grad_norm": 2.02730635206175,
"kl": 0.052490234375,
"learning_rate": 1.0741660690389365e-06,
"loss": 0.0021,
"reward": 1.6193575859069824,
"reward_std": 0.1369372010231018,
"rewards/accuracy_reward": 0.6193576455116272,
"rewards/format_reward": 1.0,
"step": 252
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 94.953125,
"epoch": 0.4782608695652174,
"grad_norm": 2.0450185647249373,
"kl": 0.04541015625,
"learning_rate": 1.068242413364671e-06,
"loss": 0.0018,
"reward": 1.5561109781265259,
"reward_std": 0.18579518795013428,
"rewards/accuracy_reward": 0.5561109781265259,
"rewards/format_reward": 1.0,
"step": 253
},
{
"all_correct": 0.5,
"all_wrong": 0.21875,
"completion_length": 86.5078125,
"epoch": 0.48015122873345933,
"grad_norm": 3.4092659487344292,
"kl": 0.050048828125,
"learning_rate": 1.0623163508820976e-06,
"loss": 0.002,
"reward": 1.5766924619674683,
"reward_std": 0.10343727469444275,
"rewards/accuracy_reward": 0.5766924619674683,
"rewards/format_reward": 1.0,
"step": 254
},
{
"all_correct": 0.46875,
"all_wrong": 0.15625,
"completion_length": 89.734375,
"epoch": 0.4820415879017013,
"grad_norm": 1.4529133237212937,
"kl": 0.0517578125,
"learning_rate": 1.0563880905946158e-06,
"loss": 0.0021,
"reward": 1.6530107259750366,
"reward_std": 0.17242193222045898,
"rewards/accuracy_reward": 0.6530107259750366,
"rewards/format_reward": 1.0,
"step": 255
},
{
"all_correct": 0.4375,
"all_wrong": 0.09375,
"completion_length": 85.3359375,
"epoch": 0.4839319470699433,
"grad_norm": 1.6167911610708365,
"kl": 0.0478515625,
"learning_rate": 1.0504578415831394e-06,
"loss": 0.0019,
"reward": 1.7061023712158203,
"reward_std": 0.1580139398574829,
"rewards/accuracy_reward": 0.7061024904251099,
"rewards/format_reward": 1.0,
"step": 256
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 89.16015625,
"epoch": 0.48582230623818523,
"grad_norm": 3.1031477511238963,
"kl": 0.051513671875,
"learning_rate": 1.0445258129987204e-06,
"loss": 0.0021,
"reward": 1.5749967098236084,
"reward_std": 0.12671510875225067,
"rewards/accuracy_reward": 0.5749967098236084,
"rewards/format_reward": 1.0,
"step": 257
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 97.05078125,
"epoch": 0.4877126654064272,
"grad_norm": 1.587621420217378,
"kl": 0.0478515625,
"learning_rate": 1.0385922140551751e-06,
"loss": 0.0019,
"reward": 1.5610603094100952,
"reward_std": 0.15413255989551544,
"rewards/accuracy_reward": 0.5610603094100952,
"rewards/format_reward": 1.0,
"step": 258
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 92.26171875,
"epoch": 0.4896030245746692,
"grad_norm": 1.4717842344274057,
"kl": 0.05078125,
"learning_rate": 1.0326572540217027e-06,
"loss": 0.002,
"reward": 1.5245153903961182,
"reward_std": 0.1461203396320343,
"rewards/accuracy_reward": 0.5245153307914734,
"rewards/format_reward": 1.0,
"step": 259
},
{
"all_correct": 0.59375,
"all_wrong": 0.1875,
"completion_length": 94.67578125,
"epoch": 0.4914933837429111,
"grad_norm": 1.2872025757031114,
"kl": 0.048828125,
"learning_rate": 1.026721142215507e-06,
"loss": 0.002,
"reward": 1.6810582876205444,
"reward_std": 0.0886523649096489,
"rewards/accuracy_reward": 0.6810582876205444,
"rewards/format_reward": 1.0,
"step": 260
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 88.6640625,
"epoch": 0.4933837429111531,
"grad_norm": 2.8413160569475533,
"kl": 0.04931640625,
"learning_rate": 1.0207840879944122e-06,
"loss": 0.002,
"reward": 1.634856104850769,
"reward_std": 0.16094039380550385,
"rewards/accuracy_reward": 0.634856104850769,
"rewards/format_reward": 1.0,
"step": 261
},
{
"all_correct": 0.34375,
"all_wrong": 0.3125,
"completion_length": 92.28515625,
"epoch": 0.4952741020793951,
"grad_norm": 1.3820672801691911,
"kl": 0.046875,
"learning_rate": 1.014846300749481e-06,
"loss": 0.0019,
"reward": 1.5555245876312256,
"reward_std": 0.14345505833625793,
"rewards/accuracy_reward": 0.5555245876312256,
"rewards/format_reward": 1.0,
"step": 262
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 90.9765625,
"epoch": 0.497164461247637,
"grad_norm": 2.081869839864706,
"kl": 0.051025390625,
"learning_rate": 1.0089079898976282e-06,
"loss": 0.002,
"reward": 1.6466023921966553,
"reward_std": 0.1819521188735962,
"rewards/accuracy_reward": 0.6466023921966553,
"rewards/format_reward": 1.0,
"step": 263
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 94.72265625,
"epoch": 0.499054820415879,
"grad_norm": 2.580070748827961,
"kl": 0.044677734375,
"learning_rate": 1.0029693648742354e-06,
"loss": 0.0018,
"reward": 1.5194728374481201,
"reward_std": 0.21391144394874573,
"rewards/accuracy_reward": 0.5272853970527649,
"rewards/format_reward": 0.9921875,
"step": 264
},
{
"all_correct": 0.3125,
"all_wrong": 0.125,
"completion_length": 89.3515625,
"epoch": 0.500945179584121,
"grad_norm": 2.2976847333751,
"kl": 0.05224609375,
"learning_rate": 9.970306351257645e-07,
"loss": 0.0021,
"reward": 1.6085888147354126,
"reward_std": 0.23032766580581665,
"rewards/accuracy_reward": 0.6085888147354126,
"rewards/format_reward": 1.0,
"step": 265
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 88.63671875,
"epoch": 0.502835538752363,
"grad_norm": 1.5748351110633412,
"kl": 0.0576171875,
"learning_rate": 9.910920101023717e-07,
"loss": 0.0023,
"reward": 1.4456298351287842,
"reward_std": 0.1709638237953186,
"rewards/accuracy_reward": 0.4456298351287842,
"rewards/format_reward": 1.0,
"step": 266
},
{
"all_correct": 0.34375,
"all_wrong": 0.125,
"completion_length": 91.58984375,
"epoch": 0.504725897920605,
"grad_norm": 2.770591512834709,
"kl": 0.0517578125,
"learning_rate": 9.851536992505187e-07,
"loss": 0.0021,
"reward": 1.653957724571228,
"reward_std": 0.20085959136486053,
"rewards/accuracy_reward": 0.653957724571228,
"rewards/format_reward": 1.0,
"step": 267
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 91.51953125,
"epoch": 0.5066162570888468,
"grad_norm": 1.6226350288665303,
"kl": 0.047119140625,
"learning_rate": 9.792159120055879e-07,
"loss": 0.0019,
"reward": 1.5789850950241089,
"reward_std": 0.17338192462921143,
"rewards/accuracy_reward": 0.5789849758148193,
"rewards/format_reward": 1.0,
"step": 268
},
{
"all_correct": 0.28125,
"all_wrong": 0.34375,
"completion_length": 93.02734375,
"epoch": 0.5085066162570888,
"grad_norm": 1.2047460673900767,
"kl": 0.051513671875,
"learning_rate": 9.732788577844932e-07,
"loss": 0.0021,
"reward": 1.5021183490753174,
"reward_std": 0.14113232493400574,
"rewards/accuracy_reward": 0.5021182894706726,
"rewards/format_reward": 1.0,
"step": 269
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 86.44140625,
"epoch": 0.5103969754253308,
"grad_norm": 3.9170636810231727,
"kl": 0.050537109375,
"learning_rate": 9.673427459782974e-07,
"loss": 0.002,
"reward": 1.5727362632751465,
"reward_std": 0.18539920449256897,
"rewards/accuracy_reward": 0.5727362036705017,
"rewards/format_reward": 1.0,
"step": 270
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 87.015625,
"epoch": 0.5122873345935728,
"grad_norm": 1.366883049982446,
"kl": 0.05078125,
"learning_rate": 9.61407785944825e-07,
"loss": 0.002,
"reward": 1.6010971069335938,
"reward_std": 0.1428610235452652,
"rewards/accuracy_reward": 0.6050034761428833,
"rewards/format_reward": 0.99609375,
"step": 271
},
{
"all_correct": 0.25,
"all_wrong": 0.28125,
"completion_length": 90.48046875,
"epoch": 0.5141776937618148,
"grad_norm": 3.3253184693080673,
"kl": 0.053955078125,
"learning_rate": 9.554741870012795e-07,
"loss": 0.0022,
"reward": 1.478670358657837,
"reward_std": 0.16775619983673096,
"rewards/accuracy_reward": 0.4786703884601593,
"rewards/format_reward": 1.0,
"step": 272
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 86.51171875,
"epoch": 0.5160680529300568,
"grad_norm": 1.549632672045942,
"kl": 0.048095703125,
"learning_rate": 9.495421584168608e-07,
"loss": 0.0019,
"reward": 1.594543695449829,
"reward_std": 0.18858283758163452,
"rewards/accuracy_reward": 0.5984500050544739,
"rewards/format_reward": 0.99609375,
"step": 273
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 84.9453125,
"epoch": 0.5179584120982986,
"grad_norm": 2.314889573705669,
"kl": 0.050048828125,
"learning_rate": 9.436119094053845e-07,
"loss": 0.002,
"reward": 1.5329444408416748,
"reward_std": 0.19728565216064453,
"rewards/accuracy_reward": 0.5329445004463196,
"rewards/format_reward": 1.0,
"step": 274
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 91.96484375,
"epoch": 0.5198487712665406,
"grad_norm": 3.0057549499090004,
"kl": 0.046142578125,
"learning_rate": 9.376836491179027e-07,
"loss": 0.0018,
"reward": 1.6376956701278687,
"reward_std": 0.187890887260437,
"rewards/accuracy_reward": 0.6376956701278687,
"rewards/format_reward": 1.0,
"step": 275
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 83.40234375,
"epoch": 0.5217391304347826,
"grad_norm": 2.381324526997287,
"kl": 0.04931640625,
"learning_rate": 9.317575866353291e-07,
"loss": 0.002,
"reward": 1.5451953411102295,
"reward_std": 0.15440954267978668,
"rewards/accuracy_reward": 0.5451953411102295,
"rewards/format_reward": 1.0,
"step": 276
},
{
"all_correct": 0.25,
"all_wrong": 0.1875,
"completion_length": 93.62109375,
"epoch": 0.5236294896030246,
"grad_norm": 1.6038678148688907,
"kl": 0.046630859375,
"learning_rate": 9.258339309610636e-07,
"loss": 0.0019,
"reward": 1.5747730731964111,
"reward_std": 0.24761907756328583,
"rewards/accuracy_reward": 0.5786792635917664,
"rewards/format_reward": 0.99609375,
"step": 277
},
{
"all_correct": 0.21875,
"all_wrong": 0.21875,
"completion_length": 92.5625,
"epoch": 0.5255198487712666,
"grad_norm": 2.367357867898568,
"kl": 0.049072265625,
"learning_rate": 9.199128910136218e-07,
"loss": 0.002,
"reward": 1.4614596366882324,
"reward_std": 0.24768300354480743,
"rewards/accuracy_reward": 0.4653658866882324,
"rewards/format_reward": 0.99609375,
"step": 278
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 89.54296875,
"epoch": 0.5274102079395085,
"grad_norm": 2.2126449160143147,
"kl": 0.043701171875,
"learning_rate": 9.139946756192662e-07,
"loss": 0.0018,
"reward": 1.5789459943771362,
"reward_std": 0.17482253909111023,
"rewards/accuracy_reward": 0.5789459943771362,
"rewards/format_reward": 1.0,
"step": 279
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 91.51953125,
"epoch": 0.5293005671077504,
"grad_norm": 1.7672859098305684,
"kl": 0.051025390625,
"learning_rate": 9.08079493504642e-07,
"loss": 0.002,
"reward": 1.5148652791976929,
"reward_std": 0.21231761574745178,
"rewards/accuracy_reward": 0.5148652791976929,
"rewards/format_reward": 1.0,
"step": 280
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 89.3828125,
"epoch": 0.5311909262759924,
"grad_norm": 1.4247234430692624,
"kl": 0.046875,
"learning_rate": 9.021675532894144e-07,
"loss": 0.0019,
"reward": 1.6473538875579834,
"reward_std": 0.14679107069969177,
"rewards/accuracy_reward": 0.6512601971626282,
"rewards/format_reward": 0.99609375,
"step": 281
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 85.79296875,
"epoch": 0.5330812854442344,
"grad_norm": 1.3257824153478301,
"kl": 0.048828125,
"learning_rate": 8.962590634789123e-07,
"loss": 0.002,
"reward": 1.6150450706481934,
"reward_std": 0.1819513887166977,
"rewards/accuracy_reward": 0.6150450706481934,
"rewards/format_reward": 1.0,
"step": 282
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 85.34375,
"epoch": 0.5349716446124764,
"grad_norm": 1.6854728753752273,
"kl": 0.05078125,
"learning_rate": 8.903542324567735e-07,
"loss": 0.002,
"reward": 1.5195235013961792,
"reward_std": 0.20223002135753632,
"rewards/accuracy_reward": 0.5390547513961792,
"rewards/format_reward": 0.98046875,
"step": 283
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 91.28515625,
"epoch": 0.5368620037807184,
"grad_norm": 1.8405481449092091,
"kl": 0.05419921875,
"learning_rate": 8.844532684775963e-07,
"loss": 0.0022,
"reward": 1.592590093612671,
"reward_std": 0.1902393102645874,
"rewards/accuracy_reward": 0.5925900340080261,
"rewards/format_reward": 1.0,
"step": 284
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 92.78125,
"epoch": 0.5387523629489603,
"grad_norm": 3.7950521224218687,
"kl": 0.044189453125,
"learning_rate": 8.785563796595938e-07,
"loss": 0.0018,
"reward": 1.6031997203826904,
"reward_std": 0.19717274606227875,
"rewards/accuracy_reward": 0.6031997203826904,
"rewards/format_reward": 1.0,
"step": 285
},
{
"all_correct": 0.46875,
"all_wrong": 0.25,
"completion_length": 81.3046875,
"epoch": 0.5406427221172023,
"grad_norm": 2.3590193160665893,
"kl": 0.061767578125,
"learning_rate": 8.726637739772541e-07,
"loss": 0.0025,
"reward": 1.675480842590332,
"reward_std": 0.10290536284446716,
"rewards/accuracy_reward": 0.6754807829856873,
"rewards/format_reward": 1.0,
"step": 286
},
{
"all_correct": 0.40625,
"all_wrong": 0.28125,
"completion_length": 84.125,
"epoch": 0.5425330812854442,
"grad_norm": 1.2126067200747002,
"kl": 0.055908203125,
"learning_rate": 8.667756592540063e-07,
"loss": 0.0022,
"reward": 1.5611882209777832,
"reward_std": 0.11617802083492279,
"rewards/accuracy_reward": 0.5611881017684937,
"rewards/format_reward": 1.0,
"step": 287
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 89.6796875,
"epoch": 0.5444234404536862,
"grad_norm": 1.966929179296431,
"kl": 0.045166015625,
"learning_rate": 8.608922431548887e-07,
"loss": 0.0018,
"reward": 1.6376736164093018,
"reward_std": 0.18406596779823303,
"rewards/accuracy_reward": 0.6376736760139465,
"rewards/format_reward": 1.0,
"step": 288
},
{
"all_correct": 0.34375,
"all_wrong": 0.125,
"completion_length": 94.79296875,
"epoch": 0.5463137996219282,
"grad_norm": 1.756150509577632,
"kl": 0.0400390625,
"learning_rate": 8.550137331792269e-07,
"loss": 0.0016,
"reward": 1.6595051288604736,
"reward_std": 0.24478332698345184,
"rewards/accuracy_reward": 0.6673176884651184,
"rewards/format_reward": 0.9921875,
"step": 289
},
{
"all_correct": 0.25,
"all_wrong": 0.34375,
"completion_length": 87.40625,
"epoch": 0.5482041587901701,
"grad_norm": 1.9775373962167517,
"kl": 0.052978515625,
"learning_rate": 8.49140336653315e-07,
"loss": 0.0021,
"reward": 1.4470252990722656,
"reward_std": 0.17483514547348022,
"rewards/accuracy_reward": 0.4470253586769104,
"rewards/format_reward": 1.0,
"step": 290
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 88.69921875,
"epoch": 0.5500945179584121,
"grad_norm": 2.063086280742146,
"kl": 0.051025390625,
"learning_rate": 8.432722607231029e-07,
"loss": 0.002,
"reward": 1.6172977685928345,
"reward_std": 0.20396284759044647,
"rewards/accuracy_reward": 0.6172977685928345,
"rewards/format_reward": 1.0,
"step": 291
},
{
"all_correct": 0.28125,
"all_wrong": 0.0625,
"completion_length": 99.53515625,
"epoch": 0.5519848771266541,
"grad_norm": 2.621542277949528,
"kl": 0.04150390625,
"learning_rate": 8.374097123468917e-07,
"loss": 0.0017,
"reward": 1.5667483806610107,
"reward_std": 0.24786125123500824,
"rewards/accuracy_reward": 0.5706546306610107,
"rewards/format_reward": 0.99609375,
"step": 292
},
{
"all_correct": 0.28125,
"all_wrong": 0.15625,
"completion_length": 86.796875,
"epoch": 0.553875236294896,
"grad_norm": 2.8165649295162316,
"kl": 0.048095703125,
"learning_rate": 8.315528982880337e-07,
"loss": 0.0019,
"reward": 1.5577614307403564,
"reward_std": 0.25827598571777344,
"rewards/accuracy_reward": 0.5655738711357117,
"rewards/format_reward": 0.9921875,
"step": 293
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 87.5390625,
"epoch": 0.555765595463138,
"grad_norm": 2.0837535994437473,
"kl": 0.0439453125,
"learning_rate": 8.257020251076392e-07,
"loss": 0.0018,
"reward": 1.5403378009796143,
"reward_std": 0.2276870459318161,
"rewards/accuracy_reward": 0.5403377413749695,
"rewards/format_reward": 1.0,
"step": 294
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 93.94921875,
"epoch": 0.55765595463138,
"grad_norm": 3.9566867940245594,
"kl": 0.05029296875,
"learning_rate": 8.198572991572939e-07,
"loss": 0.002,
"reward": 1.6211934089660645,
"reward_std": 0.21309423446655273,
"rewards/accuracy_reward": 0.6290059089660645,
"rewards/format_reward": 0.9921875,
"step": 295
},
{
"all_correct": 0.21875,
"all_wrong": 0.28125,
"completion_length": 100.11328125,
"epoch": 0.5595463137996219,
"grad_norm": 1.6039871922459095,
"kl": 0.046142578125,
"learning_rate": 8.140189265717793e-07,
"loss": 0.0018,
"reward": 1.3850700855255127,
"reward_std": 0.21388718485832214,
"rewards/accuracy_reward": 0.42413264513015747,
"rewards/format_reward": 0.9609375,
"step": 296
},
{
"all_correct": 0.34375,
"all_wrong": 0.34375,
"completion_length": 92.2734375,
"epoch": 0.5614366729678639,
"grad_norm": 1.4495763345559223,
"kl": 0.05126953125,
"learning_rate": 8.081871132618035e-07,
"loss": 0.0021,
"reward": 1.4881727695465088,
"reward_std": 0.1367965191602707,
"rewards/accuracy_reward": 0.4920789897441864,
"rewards/format_reward": 0.99609375,
"step": 297
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 78.7109375,
"epoch": 0.5633270321361059,
"grad_norm": 2.195186142439845,
"kl": 0.054443359375,
"learning_rate": 8.023620649067383e-07,
"loss": 0.0022,
"reward": 1.6418862342834473,
"reward_std": 0.17705166339874268,
"rewards/accuracy_reward": 0.6418863534927368,
"rewards/format_reward": 1.0,
"step": 298
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 95.70703125,
"epoch": 0.5652173913043478,
"grad_norm": 1.5458362887848978,
"kl": 0.044677734375,
"learning_rate": 7.965439869473663e-07,
"loss": 0.0018,
"reward": 1.5705434083938599,
"reward_std": 0.1792255938053131,
"rewards/accuracy_reward": 0.5705434083938599,
"rewards/format_reward": 1.0,
"step": 299
},
{
"all_correct": 0.375,
"all_wrong": 0.25,
"completion_length": 91.9296875,
"epoch": 0.5671077504725898,
"grad_norm": 2.0503603995341266,
"kl": 0.04541015625,
"learning_rate": 7.907330845786337e-07,
"loss": 0.0018,
"reward": 1.5330439805984497,
"reward_std": 0.1628941148519516,
"rewards/accuracy_reward": 0.5564814805984497,
"rewards/format_reward": 0.9765625,
"step": 300
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 86.5078125,
"epoch": 0.5689981096408318,
"grad_norm": 1.4513016708834336,
"kl": 0.0478515625,
"learning_rate": 7.849295627424147e-07,
"loss": 0.0019,
"reward": 1.6002389192581177,
"reward_std": 0.16803400218486786,
"rewards/accuracy_reward": 0.6158639192581177,
"rewards/format_reward": 0.984375,
"step": 301
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 85.36328125,
"epoch": 0.5708884688090737,
"grad_norm": 2.0386539297024187,
"kl": 0.050537109375,
"learning_rate": 7.791336261202834e-07,
"loss": 0.002,
"reward": 1.6394249200820923,
"reward_std": 0.18532907962799072,
"rewards/accuracy_reward": 0.6550499200820923,
"rewards/format_reward": 0.984375,
"step": 302
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 91.37109375,
"epoch": 0.5727788279773157,
"grad_norm": 1.7416561634122243,
"kl": 0.047119140625,
"learning_rate": 7.733454791262945e-07,
"loss": 0.0019,
"reward": 1.5273735523223877,
"reward_std": 0.1830909550189972,
"rewards/accuracy_reward": 0.5273735523223877,
"rewards/format_reward": 1.0,
"step": 303
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 93.0390625,
"epoch": 0.5746691871455577,
"grad_norm": 1.4257574938910071,
"kl": 0.04931640625,
"learning_rate": 7.67565325899774e-07,
"loss": 0.002,
"reward": 1.6209030151367188,
"reward_std": 0.1880435049533844,
"rewards/accuracy_reward": 0.6209030747413635,
"rewards/format_reward": 1.0,
"step": 304
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 94.640625,
"epoch": 0.5765595463137996,
"grad_norm": 2.523907663059568,
"kl": 0.0478515625,
"learning_rate": 7.617933702981197e-07,
"loss": 0.0019,
"reward": 1.646308183670044,
"reward_std": 0.20141802728176117,
"rewards/accuracy_reward": 0.650214433670044,
"rewards/format_reward": 0.99609375,
"step": 305
},
{
"all_correct": 0.3125,
"all_wrong": 0.28125,
"completion_length": 88.98828125,
"epoch": 0.5784499054820416,
"grad_norm": 1.8869992601885726,
"kl": 0.052978515625,
"learning_rate": 7.560298158896114e-07,
"loss": 0.0021,
"reward": 1.5234375,
"reward_std": 0.19467194378376007,
"rewards/accuracy_reward": 0.52734375,
"rewards/format_reward": 0.99609375,
"step": 306
},
{
"all_correct": 0.4375,
"all_wrong": 0.125,
"completion_length": 91.984375,
"epoch": 0.5803402646502835,
"grad_norm": 1.593500363249208,
"kl": 0.050537109375,
"learning_rate": 7.50274865946231e-07,
"loss": 0.002,
"reward": 1.6522129774093628,
"reward_std": 0.15789154171943665,
"rewards/accuracy_reward": 0.6522129774093628,
"rewards/format_reward": 1.0,
"step": 307
},
{
"all_correct": 0.375,
"all_wrong": 0.125,
"completion_length": 89.9140625,
"epoch": 0.5822306238185255,
"grad_norm": 1.5345766665959373,
"kl": 0.046630859375,
"learning_rate": 7.445287234364945e-07,
"loss": 0.0019,
"reward": 1.6170084476470947,
"reward_std": 0.1939898431301117,
"rewards/accuracy_reward": 0.61700838804245,
"rewards/format_reward": 1.0,
"step": 308
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 91.09375,
"epoch": 0.5841209829867675,
"grad_norm": 1.633714349616306,
"kl": 0.04541015625,
"learning_rate": 7.38791591018292e-07,
"loss": 0.0018,
"reward": 1.5211251974105835,
"reward_std": 0.19408775866031647,
"rewards/accuracy_reward": 0.5367502570152283,
"rewards/format_reward": 0.984375,
"step": 309
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 94.28515625,
"epoch": 0.5860113421550095,
"grad_norm": 1.7832966629730098,
"kl": 0.044677734375,
"learning_rate": 7.330636710317417e-07,
"loss": 0.0018,
"reward": 1.6072568893432617,
"reward_std": 0.1647563874721527,
"rewards/accuracy_reward": 0.6189756989479065,
"rewards/format_reward": 0.98828125,
"step": 310
},
{
"all_correct": 0.375,
"all_wrong": 0.125,
"completion_length": 89.41796875,
"epoch": 0.5879017013232514,
"grad_norm": 2.1849079934738294,
"kl": 0.047607421875,
"learning_rate": 7.27345165492053e-07,
"loss": 0.0019,
"reward": 1.5788297653198242,
"reward_std": 0.18523138761520386,
"rewards/accuracy_reward": 0.5788298845291138,
"rewards/format_reward": 1.0,
"step": 311
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 98.234375,
"epoch": 0.5897920604914934,
"grad_norm": 1.432954670732317,
"kl": 0.044189453125,
"learning_rate": 7.216362760824009e-07,
"loss": 0.0018,
"reward": 1.6121280193328857,
"reward_std": 0.17768144607543945,
"rewards/accuracy_reward": 0.612127959728241,
"rewards/format_reward": 1.0,
"step": 312
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 85.05078125,
"epoch": 0.5916824196597353,
"grad_norm": 1.7095977165647167,
"kl": 0.0517578125,
"learning_rate": 7.159372041468149e-07,
"loss": 0.0021,
"reward": 1.6863864660263062,
"reward_std": 0.14531907439231873,
"rewards/accuracy_reward": 0.6863864660263062,
"rewards/format_reward": 1.0,
"step": 313
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 89.55078125,
"epoch": 0.5935727788279773,
"grad_norm": 1.7588812930279742,
"kl": 0.0439453125,
"learning_rate": 7.102481506830763e-07,
"loss": 0.0018,
"reward": 1.4836997985839844,
"reward_std": 0.21439874172210693,
"rewards/accuracy_reward": 0.48369988799095154,
"rewards/format_reward": 1.0,
"step": 314
},
{
"all_correct": 0.28125,
"all_wrong": 0.28125,
"completion_length": 93.41015625,
"epoch": 0.5954631379962193,
"grad_norm": 2.719325851548355,
"kl": 0.04052734375,
"learning_rate": 7.045693163356299e-07,
"loss": 0.0016,
"reward": 1.5147807598114014,
"reward_std": 0.16246028244495392,
"rewards/accuracy_reward": 0.5147807002067566,
"rewards/format_reward": 1.0,
"step": 315
},
{
"all_correct": 0.21875,
"all_wrong": 0.28125,
"completion_length": 101.6171875,
"epoch": 0.5973534971644613,
"grad_norm": 12.827054251723668,
"kl": 0.046875,
"learning_rate": 6.989009013885076e-07,
"loss": 0.0019,
"reward": 1.442307710647583,
"reward_std": 0.24046628177165985,
"rewards/accuracy_reward": 0.457932710647583,
"rewards/format_reward": 0.984375,
"step": 316
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 87.93359375,
"epoch": 0.5992438563327032,
"grad_norm": 1.6360722613389922,
"kl": 0.055908203125,
"learning_rate": 6.932431057582646e-07,
"loss": 0.0022,
"reward": 1.6484375,
"reward_std": 0.16018126904964447,
"rewards/accuracy_reward": 0.6484375,
"rewards/format_reward": 1.0,
"step": 317
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 104.6484375,
"epoch": 0.6011342155009451,
"grad_norm": 1.8443996597658288,
"kl": 0.04345703125,
"learning_rate": 6.875961289869282e-07,
"loss": 0.0017,
"reward": 1.5236172676086426,
"reward_std": 0.20754508674144745,
"rewards/accuracy_reward": 0.5236173272132874,
"rewards/format_reward": 1.0,
"step": 318
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 83.109375,
"epoch": 0.6030245746691871,
"grad_norm": 2.136504314601804,
"kl": 0.051025390625,
"learning_rate": 6.819601702349608e-07,
"loss": 0.002,
"reward": 1.6519629955291748,
"reward_std": 0.15085574984550476,
"rewards/accuracy_reward": 0.6519629955291748,
"rewards/format_reward": 1.0,
"step": 319
},
{
"all_correct": 0.46875,
"all_wrong": 0.0625,
"completion_length": 93.57421875,
"epoch": 0.6049149338374291,
"grad_norm": 1.4937943209360285,
"kl": 0.042236328125,
"learning_rate": 6.763354282742362e-07,
"loss": 0.0017,
"reward": 1.628268837928772,
"reward_std": 0.17435705661773682,
"rewards/accuracy_reward": 0.636081337928772,
"rewards/format_reward": 0.9921875,
"step": 320
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 90.71875,
"epoch": 0.6068052930056711,
"grad_norm": 8.501242318953185,
"kl": 0.051513671875,
"learning_rate": 6.707221014810278e-07,
"loss": 0.0021,
"reward": 1.5977280139923096,
"reward_std": 0.18674521148204803,
"rewards/accuracy_reward": 0.59772789478302,
"rewards/format_reward": 1.0,
"step": 321
},
{
"all_correct": 0.53125,
"all_wrong": 0.15625,
"completion_length": 100.10546875,
"epoch": 0.6086956521739131,
"grad_norm": 1.126898780019968,
"kl": 0.046875,
"learning_rate": 6.651203878290138e-07,
"loss": 0.0019,
"reward": 1.6740057468414307,
"reward_std": 0.13990236818790436,
"rewards/accuracy_reward": 0.6740056872367859,
"rewards/format_reward": 1.0,
"step": 322
},
{
"all_correct": 0.5,
"all_wrong": 0.25,
"completion_length": 87.5625,
"epoch": 0.610586011342155,
"grad_norm": 1.0829161392709314,
"kl": 0.0537109375,
"learning_rate": 6.59530484882294e-07,
"loss": 0.0021,
"reward": 1.6339197158813477,
"reward_std": 0.10196228325366974,
"rewards/accuracy_reward": 0.6339195966720581,
"rewards/format_reward": 1.0,
"step": 323
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 85.04296875,
"epoch": 0.6124763705103969,
"grad_norm": 1.5071308132082326,
"kl": 0.048095703125,
"learning_rate": 6.539525897884218e-07,
"loss": 0.0019,
"reward": 1.601670503616333,
"reward_std": 0.11899926513433456,
"rewards/accuracy_reward": 0.6016704440116882,
"rewards/format_reward": 1.0,
"step": 324
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 105.0546875,
"epoch": 0.6143667296786389,
"grad_norm": 1.3796535779838177,
"kl": 0.04638671875,
"learning_rate": 6.48386899271452e-07,
"loss": 0.0019,
"reward": 1.5369006395339966,
"reward_std": 0.1829143464565277,
"rewards/accuracy_reward": 0.5486193895339966,
"rewards/format_reward": 0.98828125,
"step": 325
},
{
"all_correct": 0.53125,
"all_wrong": 0.21875,
"completion_length": 91.96875,
"epoch": 0.6162570888468809,
"grad_norm": 1.0665567955819852,
"kl": 0.043701171875,
"learning_rate": 6.428336096250017e-07,
"loss": 0.0018,
"reward": 1.632015585899353,
"reward_std": 0.08141334354877472,
"rewards/accuracy_reward": 0.6320155262947083,
"rewards/format_reward": 1.0,
"step": 326
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 98.70703125,
"epoch": 0.6181474480151229,
"grad_norm": 1.8190601838814684,
"kl": 0.0498046875,
"learning_rate": 6.372929167053285e-07,
"loss": 0.002,
"reward": 1.5398609638214111,
"reward_std": 0.19976115226745605,
"rewards/accuracy_reward": 0.5398609638214111,
"rewards/format_reward": 1.0,
"step": 327
},
{
"all_correct": 0.46875,
"all_wrong": 0.09375,
"completion_length": 96.3984375,
"epoch": 0.6200378071833649,
"grad_norm": 10.48494019650004,
"kl": 0.05029296875,
"learning_rate": 6.317650159244212e-07,
"loss": 0.002,
"reward": 1.611169457435608,
"reward_std": 0.17813417315483093,
"rewards/accuracy_reward": 0.6150757074356079,
"rewards/format_reward": 0.99609375,
"step": 328
},
{
"all_correct": 0.5,
"all_wrong": 0.125,
"completion_length": 90.19921875,
"epoch": 0.6219281663516069,
"grad_norm": 3.3256774582394724,
"kl": 0.05029296875,
"learning_rate": 6.262501022431099e-07,
"loss": 0.002,
"reward": 1.6589438915252686,
"reward_std": 0.12664872407913208,
"rewards/accuracy_reward": 0.6628501415252686,
"rewards/format_reward": 0.99609375,
"step": 329
},
{
"all_correct": 0.40625,
"all_wrong": 0.03125,
"completion_length": 100.25390625,
"epoch": 0.6238185255198487,
"grad_norm": 1.4299757860972675,
"kl": 0.04052734375,
"learning_rate": 6.207483701641887e-07,
"loss": 0.0016,
"reward": 1.7447913885116577,
"reward_std": 0.18475459516048431,
"rewards/accuracy_reward": 0.7447913885116577,
"rewards/format_reward": 1.0,
"step": 330
},
{
"all_correct": 0.34375,
"all_wrong": 0.375,
"completion_length": 91.765625,
"epoch": 0.6257088846880907,
"grad_norm": 1.1358781168676717,
"kl": 0.0478515625,
"learning_rate": 6.15260013725555e-07,
"loss": 0.0019,
"reward": 1.4526742696762085,
"reward_std": 0.11981412023305893,
"rewards/accuracy_reward": 0.4526742696762085,
"rewards/format_reward": 1.0,
"step": 331
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 98.609375,
"epoch": 0.6275992438563327,
"grad_norm": 2.5447540141563048,
"kl": 0.048828125,
"learning_rate": 6.097852264933696e-07,
"loss": 0.002,
"reward": 1.619698166847229,
"reward_std": 0.16045129299163818,
"rewards/accuracy_reward": 0.6275107860565186,
"rewards/format_reward": 0.9921875,
"step": 332
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 91.7265625,
"epoch": 0.6294896030245747,
"grad_norm": 1.5705944493635686,
"kl": 0.0478515625,
"learning_rate": 6.043242015552256e-07,
"loss": 0.0019,
"reward": 1.52734375,
"reward_std": 0.21317726373672485,
"rewards/accuracy_reward": 0.52734375,
"rewards/format_reward": 1.0,
"step": 333
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 93.20703125,
"epoch": 0.6313799621928167,
"grad_norm": 1.4532227455775275,
"kl": 0.039306640625,
"learning_rate": 5.988771315133417e-07,
"loss": 0.0016,
"reward": 1.5797886848449707,
"reward_std": 0.17210961878299713,
"rewards/accuracy_reward": 0.5797887444496155,
"rewards/format_reward": 1.0,
"step": 334
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 89.7421875,
"epoch": 0.6332703213610587,
"grad_norm": 1.5032464010247504,
"kl": 0.04150390625,
"learning_rate": 5.934442084777675e-07,
"loss": 0.0017,
"reward": 1.6588280200958252,
"reward_std": 0.17925553023815155,
"rewards/accuracy_reward": 0.6588280200958252,
"rewards/format_reward": 1.0,
"step": 335
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 89.48046875,
"epoch": 0.6351606805293005,
"grad_norm": 3.9562339582094346,
"kl": 0.056640625,
"learning_rate": 5.880256240596095e-07,
"loss": 0.0023,
"reward": 1.5489052534103394,
"reward_std": 0.1494232714176178,
"rewards/accuracy_reward": 0.5489052534103394,
"rewards/format_reward": 1.0,
"step": 336
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 91.6171875,
"epoch": 0.6370510396975425,
"grad_norm": 4.504748528156339,
"kl": 0.0537109375,
"learning_rate": 5.826215693642709e-07,
"loss": 0.0021,
"reward": 1.574186086654663,
"reward_std": 0.14129537343978882,
"rewards/accuracy_reward": 0.5780923366546631,
"rewards/format_reward": 0.99609375,
"step": 337
},
{
"all_correct": 0.53125,
"all_wrong": 0.25,
"completion_length": 96.40234375,
"epoch": 0.6389413988657845,
"grad_norm": 0.9918255765496156,
"kl": 0.046875,
"learning_rate": 5.772322349847153e-07,
"loss": 0.0019,
"reward": 1.65234375,
"reward_std": 0.09954919666051865,
"rewards/accuracy_reward": 0.65234375,
"rewards/format_reward": 1.0,
"step": 338
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 90.94921875,
"epoch": 0.6408317580340265,
"grad_norm": 2.0285380237709387,
"kl": 0.050537109375,
"learning_rate": 5.718578109947409e-07,
"loss": 0.002,
"reward": 1.608215093612671,
"reward_std": 0.10201030969619751,
"rewards/accuracy_reward": 0.6082150340080261,
"rewards/format_reward": 1.0,
"step": 339
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 87.05078125,
"epoch": 0.6427221172022685,
"grad_norm": 3.3176011557910097,
"kl": 0.053466796875,
"learning_rate": 5.664984869422802e-07,
"loss": 0.0021,
"reward": 1.5531736612319946,
"reward_std": 0.17685286700725555,
"rewards/accuracy_reward": 0.5531736612319946,
"rewards/format_reward": 1.0,
"step": 340
},
{
"all_correct": 0.4375,
"all_wrong": 0.09375,
"completion_length": 100.0234375,
"epoch": 0.6446124763705104,
"grad_norm": 1.9481732665617166,
"kl": 0.049072265625,
"learning_rate": 5.611544518427121e-07,
"loss": 0.002,
"reward": 1.6415621042251587,
"reward_std": 0.163002610206604,
"rewards/accuracy_reward": 0.6493746042251587,
"rewards/format_reward": 0.9921875,
"step": 341
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 91.18359375,
"epoch": 0.6465028355387523,
"grad_norm": 1.9147871985701481,
"kl": 0.04931640625,
"learning_rate": 5.558258941721981e-07,
"loss": 0.002,
"reward": 1.5601630210876465,
"reward_std": 0.17341557145118713,
"rewards/accuracy_reward": 0.5640692710876465,
"rewards/format_reward": 0.99609375,
"step": 342
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 86.3515625,
"epoch": 0.6483931947069943,
"grad_norm": 2.2167075436448465,
"kl": 0.05126953125,
"learning_rate": 5.505130018610321e-07,
"loss": 0.002,
"reward": 1.6853443384170532,
"reward_std": 0.20850321650505066,
"rewards/accuracy_reward": 0.7009693384170532,
"rewards/format_reward": 0.984375,
"step": 343
},
{
"all_correct": 0.5,
"all_wrong": 0.1875,
"completion_length": 83.95703125,
"epoch": 0.6502835538752363,
"grad_norm": 2.499264160160712,
"kl": 0.055908203125,
"learning_rate": 5.452159622870157e-07,
"loss": 0.0022,
"reward": 1.6708264350891113,
"reward_std": 0.09797890484333038,
"rewards/accuracy_reward": 0.6708264350891113,
"rewards/format_reward": 1.0,
"step": 344
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 98.9375,
"epoch": 0.6521739130434783,
"grad_norm": 1.1468871398817573,
"kl": 0.051025390625,
"learning_rate": 5.399349622688478e-07,
"loss": 0.002,
"reward": 1.5537773370742798,
"reward_std": 0.17344093322753906,
"rewards/accuracy_reward": 0.5772148370742798,
"rewards/format_reward": 0.9765625,
"step": 345
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 91.84765625,
"epoch": 0.6540642722117203,
"grad_norm": 1.365593331621954,
"kl": 0.0498046875,
"learning_rate": 5.346701880595353e-07,
"loss": 0.002,
"reward": 1.6378886699676514,
"reward_std": 0.14274653792381287,
"rewards/accuracy_reward": 0.6378886699676514,
"rewards/format_reward": 1.0,
"step": 346
},
{
"all_correct": 0.4375,
"all_wrong": 0.25,
"completion_length": 94.91015625,
"epoch": 0.6559546313799622,
"grad_norm": 1.5634515651271659,
"kl": 0.045654296875,
"learning_rate": 5.29421825339826e-07,
"loss": 0.0018,
"reward": 1.589550495147705,
"reward_std": 0.13411790132522583,
"rewards/accuracy_reward": 0.5973629951477051,
"rewards/format_reward": 0.9921875,
"step": 347
},
{
"all_correct": 0.4375,
"all_wrong": 0.25,
"completion_length": 90.8359375,
"epoch": 0.6578449905482041,
"grad_norm": 1.1226897308498045,
"kl": 0.0458984375,
"learning_rate": 5.241900592116579e-07,
"loss": 0.0018,
"reward": 1.5573174953460693,
"reward_std": 0.09218928962945938,
"rewards/accuracy_reward": 0.5573174953460693,
"rewards/format_reward": 1.0,
"step": 348
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 94.4453125,
"epoch": 0.6597353497164461,
"grad_norm": 1.4290088468656539,
"kl": 0.051513671875,
"learning_rate": 5.189750741916326e-07,
"loss": 0.0021,
"reward": 1.623161792755127,
"reward_std": 0.20275253057479858,
"rewards/accuracy_reward": 0.642693042755127,
"rewards/format_reward": 0.98046875,
"step": 349
},
{
"all_correct": 0.34375,
"all_wrong": 0.03125,
"completion_length": 96.1015625,
"epoch": 0.6616257088846881,
"grad_norm": 1.8681119609391044,
"kl": 0.048583984375,
"learning_rate": 5.137770542045062e-07,
"loss": 0.0019,
"reward": 1.688063144683838,
"reward_std": 0.23905277252197266,
"rewards/accuracy_reward": 0.6997818946838379,
"rewards/format_reward": 0.98828125,
"step": 350
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 84.9453125,
"epoch": 0.6635160680529301,
"grad_norm": 1.4728027178236136,
"kl": 0.04833984375,
"learning_rate": 5.085961825767049e-07,
"loss": 0.0019,
"reward": 1.674993872642517,
"reward_std": 0.17204201221466064,
"rewards/accuracy_reward": 0.6789001226425171,
"rewards/format_reward": 0.99609375,
"step": 351
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 86.9140625,
"epoch": 0.665406427221172,
"grad_norm": 3.785173448605038,
"kl": 0.04443359375,
"learning_rate": 5.034326420298557e-07,
"loss": 0.0018,
"reward": 1.5950738191604614,
"reward_std": 0.18303368985652924,
"rewards/accuracy_reward": 0.5950738191604614,
"rewards/format_reward": 1.0,
"step": 352
},
{
"all_correct": 0.28125,
"all_wrong": 0.375,
"completion_length": 93.01953125,
"epoch": 0.667296786389414,
"grad_norm": 1.4357110637131671,
"kl": 0.0458984375,
"learning_rate": 4.982866146743464e-07,
"loss": 0.0018,
"reward": 1.3835440874099731,
"reward_std": 0.16043886542320251,
"rewards/accuracy_reward": 0.40307533740997314,
"rewards/format_reward": 0.98046875,
"step": 353
},
{
"all_correct": 0.40625,
"all_wrong": 0.0625,
"completion_length": 91.83203125,
"epoch": 0.6691871455576559,
"grad_norm": 2.0314351942520203,
"kl": 0.037841796875,
"learning_rate": 4.93158282002899e-07,
"loss": 0.0015,
"reward": 1.691582441329956,
"reward_std": 0.21125006675720215,
"rewards/accuracy_reward": 0.6915825605392456,
"rewards/format_reward": 1.0,
"step": 354
},
{
"all_correct": 0.28125,
"all_wrong": 0.3125,
"completion_length": 96.66015625,
"epoch": 0.6710775047258979,
"grad_norm": 1.360692718067674,
"kl": 0.046630859375,
"learning_rate": 4.880478248841706e-07,
"loss": 0.0019,
"reward": 1.4369994401931763,
"reward_std": 0.17168085277080536,
"rewards/accuracy_reward": 0.44090569019317627,
"rewards/format_reward": 0.99609375,
"step": 355
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 85.65234375,
"epoch": 0.6729678638941399,
"grad_norm": 2.116858815599637,
"kl": 0.047607421875,
"learning_rate": 4.82955423556375e-07,
"loss": 0.0019,
"reward": 1.5795400142669678,
"reward_std": 0.18496635556221008,
"rewards/accuracy_reward": 0.5795398950576782,
"rewards/format_reward": 1.0,
"step": 356
},
{
"all_correct": 0.375,
"all_wrong": 0.28125,
"completion_length": 83.1875,
"epoch": 0.6748582230623819,
"grad_norm": 1.7162608457835138,
"kl": 0.04833984375,
"learning_rate": 4.778812576209241e-07,
"loss": 0.0019,
"reward": 1.5707752704620361,
"reward_std": 0.09887948632240295,
"rewards/accuracy_reward": 0.5707752108573914,
"rewards/format_reward": 1.0,
"step": 357
},
{
"all_correct": 0.25,
"all_wrong": 0.15625,
"completion_length": 88.57421875,
"epoch": 0.6767485822306238,
"grad_norm": 2.366206137579118,
"kl": 0.0439453125,
"learning_rate": 4.728255060360955e-07,
"loss": 0.0018,
"reward": 1.6372836828231812,
"reward_std": 0.17435956001281738,
"rewards/accuracy_reward": 0.6372836828231812,
"rewards/format_reward": 1.0,
"step": 358
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 86.61328125,
"epoch": 0.6786389413988658,
"grad_norm": 1.692332788365531,
"kl": 0.0478515625,
"learning_rate": 4.6778834711071924e-07,
"loss": 0.0019,
"reward": 1.5796103477478027,
"reward_std": 0.15928372740745544,
"rewards/accuracy_reward": 0.5796103477478027,
"rewards/format_reward": 1.0,
"step": 359
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 90.30859375,
"epoch": 0.6805293005671077,
"grad_norm": 1.2973562747312646,
"kl": 0.05029296875,
"learning_rate": 4.627699584978911e-07,
"loss": 0.002,
"reward": 1.6404190063476562,
"reward_std": 0.14580589532852173,
"rewards/accuracy_reward": 0.640419065952301,
"rewards/format_reward": 1.0,
"step": 360
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 91.734375,
"epoch": 0.6824196597353497,
"grad_norm": 1.924682020333693,
"kl": 0.045166015625,
"learning_rate": 4.57770517188705e-07,
"loss": 0.0018,
"reward": 1.5807889699935913,
"reward_std": 0.18111172318458557,
"rewards/accuracy_reward": 0.5846952199935913,
"rewards/format_reward": 0.99609375,
"step": 361
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 92.046875,
"epoch": 0.6843100189035917,
"grad_norm": 1.4869820010117951,
"kl": 0.044921875,
"learning_rate": 4.527901995060113e-07,
"loss": 0.0018,
"reward": 1.6711153984069824,
"reward_std": 0.16736207902431488,
"rewards/accuracy_reward": 0.6789278984069824,
"rewards/format_reward": 0.9921875,
"step": 362
},
{
"all_correct": 0.15625,
"all_wrong": 0.25,
"completion_length": 91.83984375,
"epoch": 0.6862003780718336,
"grad_norm": 1.7329104320683526,
"kl": 0.04150390625,
"learning_rate": 4.4782918109819976e-07,
"loss": 0.0017,
"reward": 1.4670283794403076,
"reward_std": 0.24669389426708221,
"rewards/accuracy_reward": 0.4670283794403076,
"rewards/format_reward": 1.0,
"step": 363
},
{
"all_correct": 0.40625,
"all_wrong": 0.25,
"completion_length": 93.6328125,
"epoch": 0.6880907372400756,
"grad_norm": 1.5230757953398353,
"kl": 0.052490234375,
"learning_rate": 4.4288763693300226e-07,
"loss": 0.0021,
"reward": 1.6058498620986938,
"reward_std": 0.14699707925319672,
"rewards/accuracy_reward": 0.6175686120986938,
"rewards/format_reward": 0.98828125,
"step": 364
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 96.92578125,
"epoch": 0.6899810964083176,
"grad_norm": 4.290738008536141,
"kl": 0.044677734375,
"learning_rate": 4.3796574129132423e-07,
"loss": 0.0018,
"reward": 1.560653805732727,
"reward_std": 0.18818287551403046,
"rewards/accuracy_reward": 0.580185055732727,
"rewards/format_reward": 0.98046875,
"step": 365
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 91.5859375,
"epoch": 0.6918714555765595,
"grad_norm": 2.5809981268174518,
"kl": 0.0478515625,
"learning_rate": 4.3306366776109616e-07,
"loss": 0.0019,
"reward": 1.5660797357559204,
"reward_std": 0.1301419585943222,
"rewards/accuracy_reward": 0.5660797357559204,
"rewards/format_reward": 1.0,
"step": 366
},
{
"all_correct": 0.46875,
"all_wrong": 0.09375,
"completion_length": 92.61328125,
"epoch": 0.6937618147448015,
"grad_norm": 2.395698693198084,
"kl": 0.047119140625,
"learning_rate": 4.2818158923115244e-07,
"loss": 0.0019,
"reward": 1.6860473155975342,
"reward_std": 0.17321570217609406,
"rewards/accuracy_reward": 0.6860473155975342,
"rewards/format_reward": 1.0,
"step": 367
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 93.4375,
"epoch": 0.6956521739130435,
"grad_norm": 2.175508616602608,
"kl": 0.044677734375,
"learning_rate": 4.233196778851329e-07,
"loss": 0.0018,
"reward": 1.5961754322052002,
"reward_std": 0.15679945051670074,
"rewards/accuracy_reward": 0.5961754322052002,
"rewards/format_reward": 1.0,
"step": 368
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 100.703125,
"epoch": 0.6975425330812854,
"grad_norm": 2.3623237744377685,
"kl": 0.043212890625,
"learning_rate": 4.184781051954125e-07,
"loss": 0.0017,
"reward": 1.6343427896499634,
"reward_std": 0.1671934723854065,
"rewards/accuracy_reward": 0.6343427896499634,
"rewards/format_reward": 1.0,
"step": 369
},
{
"all_correct": 0.3125,
"all_wrong": 0.09375,
"completion_length": 97.34765625,
"epoch": 0.6994328922495274,
"grad_norm": 1.8342117768253567,
"kl": 0.0419921875,
"learning_rate": 4.136570419170501e-07,
"loss": 0.0017,
"reward": 1.5979249477386475,
"reward_std": 0.2753554880619049,
"rewards/accuracy_reward": 0.6135499477386475,
"rewards/format_reward": 0.984375,
"step": 370
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 96.44140625,
"epoch": 0.7013232514177694,
"grad_norm": 1.8499863828085692,
"kl": 0.04345703125,
"learning_rate": 4.088566580817694e-07,
"loss": 0.0017,
"reward": 1.5595123767852783,
"reward_std": 0.18973296880722046,
"rewards/accuracy_reward": 0.5634186267852783,
"rewards/format_reward": 0.99609375,
"step": 371
},
{
"all_correct": 0.46875,
"all_wrong": 0.125,
"completion_length": 99.48828125,
"epoch": 0.7032136105860114,
"grad_norm": 2.208899799066235,
"kl": 0.04541015625,
"learning_rate": 4.040771229919612e-07,
"loss": 0.0018,
"reward": 1.6975526809692383,
"reward_std": 0.17384248971939087,
"rewards/accuracy_reward": 0.7053651809692383,
"rewards/format_reward": 0.9921875,
"step": 372
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 87.18359375,
"epoch": 0.7051039697542533,
"grad_norm": 1.8987142352577209,
"kl": 0.04638671875,
"learning_rate": 3.9931860521471097e-07,
"loss": 0.0019,
"reward": 1.5160590410232544,
"reward_std": 0.21702983975410461,
"rewards/accuracy_reward": 0.5160590410232544,
"rewards/format_reward": 1.0,
"step": 373
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 92.609375,
"epoch": 0.7069943289224953,
"grad_norm": 15.04217885199878,
"kl": 0.046142578125,
"learning_rate": 3.945812725758554e-07,
"loss": 0.0018,
"reward": 1.7074790000915527,
"reward_std": 0.19710449874401093,
"rewards/accuracy_reward": 0.7113852500915527,
"rewards/format_reward": 0.99609375,
"step": 374
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 92.65625,
"epoch": 0.7088846880907372,
"grad_norm": 1.91152067931555,
"kl": 0.043701171875,
"learning_rate": 3.898652921540627e-07,
"loss": 0.0017,
"reward": 1.549987554550171,
"reward_std": 0.15819934010505676,
"rewards/accuracy_reward": 0.5617063641548157,
"rewards/format_reward": 0.98828125,
"step": 375
},
{
"all_correct": 0.25,
"all_wrong": 0.28125,
"completion_length": 89.109375,
"epoch": 0.7107750472589792,
"grad_norm": 1.585271716791016,
"kl": 0.052001953125,
"learning_rate": 3.851708302749409e-07,
"loss": 0.0021,
"reward": 1.4718488454818726,
"reward_std": 0.15529434382915497,
"rewards/accuracy_reward": 0.47184884548187256,
"rewards/format_reward": 1.0,
"step": 376
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 91.43359375,
"epoch": 0.7126654064272212,
"grad_norm": 1.6292096929321715,
"kl": 0.039794921875,
"learning_rate": 3.8049805250517e-07,
"loss": 0.0016,
"reward": 1.5054469108581543,
"reward_std": 0.1916845291852951,
"rewards/accuracy_reward": 0.5054467916488647,
"rewards/format_reward": 1.0,
"step": 377
},
{
"all_correct": 0.28125,
"all_wrong": 0.09375,
"completion_length": 87.43359375,
"epoch": 0.7145557655954632,
"grad_norm": 2.5257432847150865,
"kl": 0.045654296875,
"learning_rate": 3.7584712364666493e-07,
"loss": 0.0018,
"reward": 1.5939102172851562,
"reward_std": 0.24795284867286682,
"rewards/accuracy_reward": 0.5939102172851562,
"rewards/format_reward": 1.0,
"step": 378
},
{
"all_correct": 0.1875,
"all_wrong": 0.09375,
"completion_length": 94.90625,
"epoch": 0.7164461247637051,
"grad_norm": 2.7673062583960317,
"kl": 0.045166015625,
"learning_rate": 3.7121820773076097e-07,
"loss": 0.0018,
"reward": 1.5857834815979004,
"reward_std": 0.25405406951904297,
"rewards/accuracy_reward": 0.5857834219932556,
"rewards/format_reward": 1.0,
"step": 379
},
{
"all_correct": 0.28125,
"all_wrong": 0.09375,
"completion_length": 90.87109375,
"epoch": 0.718336483931947,
"grad_norm": 2.2822899165231294,
"kl": 0.047119140625,
"learning_rate": 3.666114680124298e-07,
"loss": 0.0019,
"reward": 1.5186080932617188,
"reward_std": 0.1982262283563614,
"rewards/accuracy_reward": 0.5186082124710083,
"rewards/format_reward": 1.0,
"step": 380
},
{
"all_correct": 0.21875,
"all_wrong": 0.21875,
"completion_length": 87.87890625,
"epoch": 0.720226843100189,
"grad_norm": 2.6361171968305346,
"kl": 0.047607421875,
"learning_rate": 3.620270669645228e-07,
"loss": 0.0019,
"reward": 1.43359375,
"reward_std": 0.2574925422668457,
"rewards/accuracy_reward": 0.43359375,
"rewards/format_reward": 1.0,
"step": 381
},
{
"all_correct": 0.25,
"all_wrong": 0.1875,
"completion_length": 85.53515625,
"epoch": 0.722117202268431,
"grad_norm": 1.934475962789802,
"kl": 0.04736328125,
"learning_rate": 3.5746516627203816e-07,
"loss": 0.0019,
"reward": 1.5397059917449951,
"reward_std": 0.2019021213054657,
"rewards/accuracy_reward": 0.5397060513496399,
"rewards/format_reward": 1.0,
"step": 382
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 89.05859375,
"epoch": 0.724007561436673,
"grad_norm": 1.7765413930033112,
"kl": 0.04248046875,
"learning_rate": 3.529259268264213e-07,
"loss": 0.0017,
"reward": 1.6433665752410889,
"reward_std": 0.1822570264339447,
"rewards/accuracy_reward": 0.6433665752410889,
"rewards/format_reward": 1.0,
"step": 383
},
{
"all_correct": 0.4375,
"all_wrong": 0.21875,
"completion_length": 90.70703125,
"epoch": 0.725897920604915,
"grad_norm": 3.235569329484398,
"kl": 0.04296875,
"learning_rate": 3.4840950871988806e-07,
"loss": 0.0017,
"reward": 1.6414120197296143,
"reward_std": 0.1383558064699173,
"rewards/accuracy_reward": 0.6414120197296143,
"rewards/format_reward": 1.0,
"step": 384
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 84.59765625,
"epoch": 0.7277882797731569,
"grad_norm": 2.5415509345709726,
"kl": 0.052978515625,
"learning_rate": 3.4391607123978096e-07,
"loss": 0.0021,
"reward": 1.6038849353790283,
"reward_std": 0.14968551695346832,
"rewards/accuracy_reward": 0.6038850545883179,
"rewards/format_reward": 1.0,
"step": 385
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 92.71484375,
"epoch": 0.7296786389413988,
"grad_norm": 1.9594609018418163,
"kl": 0.04736328125,
"learning_rate": 3.3944577286294886e-07,
"loss": 0.0019,
"reward": 1.5191731452941895,
"reward_std": 0.21669438481330872,
"rewards/accuracy_reward": 0.5308918952941895,
"rewards/format_reward": 0.98828125,
"step": 386
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 90.5,
"epoch": 0.7315689981096408,
"grad_norm": 1.5282132767893477,
"kl": 0.044189453125,
"learning_rate": 3.3499877125015907e-07,
"loss": 0.0018,
"reward": 1.5483942031860352,
"reward_std": 0.1809859573841095,
"rewards/accuracy_reward": 0.5523004531860352,
"rewards/format_reward": 0.99609375,
"step": 387
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 88.44921875,
"epoch": 0.7334593572778828,
"grad_norm": 1.511915251166267,
"kl": 0.04443359375,
"learning_rate": 3.305752232405377e-07,
"loss": 0.0018,
"reward": 1.5602011680603027,
"reward_std": 0.1667328178882599,
"rewards/accuracy_reward": 0.560201108455658,
"rewards/format_reward": 1.0,
"step": 388
},
{
"all_correct": 0.28125,
"all_wrong": 0.125,
"completion_length": 87.83203125,
"epoch": 0.7353497164461248,
"grad_norm": 2.450602394222447,
"kl": 0.0419921875,
"learning_rate": 3.2617528484603574e-07,
"loss": 0.0017,
"reward": 1.6159805059432983,
"reward_std": 0.22847887873649597,
"rewards/accuracy_reward": 0.6159805059432983,
"rewards/format_reward": 1.0,
"step": 389
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 93.2890625,
"epoch": 0.7372400756143668,
"grad_norm": 1.5610935075912176,
"kl": 0.043212890625,
"learning_rate": 3.217991112459296e-07,
"loss": 0.0017,
"reward": 1.614638328552246,
"reward_std": 0.19237719476222992,
"rewards/accuracy_reward": 0.6146383285522461,
"rewards/format_reward": 1.0,
"step": 390
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 89.69921875,
"epoch": 0.7391304347826086,
"grad_norm": 1.6750006945933873,
"kl": 0.041259765625,
"learning_rate": 3.174468567813461e-07,
"loss": 0.0017,
"reward": 1.703397274017334,
"reward_std": 0.19982343912124634,
"rewards/accuracy_reward": 0.703397274017334,
"rewards/format_reward": 1.0,
"step": 391
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 88.98046875,
"epoch": 0.7410207939508506,
"grad_norm": 1.5050532217987163,
"kl": 0.040771484375,
"learning_rate": 3.131186749498195e-07,
"loss": 0.0016,
"reward": 1.6249730587005615,
"reward_std": 0.1348705291748047,
"rewards/accuracy_reward": 0.6249730587005615,
"rewards/format_reward": 1.0,
"step": 392
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 86.80078125,
"epoch": 0.7429111531190926,
"grad_norm": 2.2056728949358497,
"kl": 0.047607421875,
"learning_rate": 3.0881471839987815e-07,
"loss": 0.0019,
"reward": 1.6306958198547363,
"reward_std": 0.15101388096809387,
"rewards/accuracy_reward": 0.6306958198547363,
"rewards/format_reward": 1.0,
"step": 393
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 91.85546875,
"epoch": 0.7448015122873346,
"grad_norm": 2.1315080678559024,
"kl": 0.044921875,
"learning_rate": 3.0453513892566195e-07,
"loss": 0.0018,
"reward": 1.544640302658081,
"reward_std": 0.19710536301136017,
"rewards/accuracy_reward": 0.5446402430534363,
"rewards/format_reward": 1.0,
"step": 394
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 85.7109375,
"epoch": 0.7466918714555766,
"grad_norm": 2.06732415423423,
"kl": 0.04931640625,
"learning_rate": 3.0028008746156587e-07,
"loss": 0.002,
"reward": 1.5586893558502197,
"reward_std": 0.15845312178134918,
"rewards/accuracy_reward": 0.5586893558502197,
"rewards/format_reward": 1.0,
"step": 395
},
{
"all_correct": 0.5,
"all_wrong": 0.09375,
"completion_length": 93.92578125,
"epoch": 0.7485822306238186,
"grad_norm": 1.8815000905698593,
"kl": 0.046630859375,
"learning_rate": 2.9604971407692026e-07,
"loss": 0.0019,
"reward": 1.697596788406372,
"reward_std": 0.13653597235679626,
"rewards/accuracy_reward": 0.6975967288017273,
"rewards/format_reward": 1.0,
"step": 396
},
{
"all_correct": 0.21875,
"all_wrong": 0.375,
"completion_length": 93.3359375,
"epoch": 0.7504725897920604,
"grad_norm": 1.5391602816018224,
"kl": 0.041015625,
"learning_rate": 2.918441679706949e-07,
"loss": 0.0016,
"reward": 1.386269211769104,
"reward_std": 0.18272624909877777,
"rewards/accuracy_reward": 0.394081711769104,
"rewards/format_reward": 0.9921875,
"step": 397
},
{
"all_correct": 0.3125,
"all_wrong": 0.28125,
"completion_length": 90.81640625,
"epoch": 0.7523629489603024,
"grad_norm": 1.1599213261291745,
"kl": 0.043701171875,
"learning_rate": 2.876635974662389e-07,
"loss": 0.0017,
"reward": 1.473933458328247,
"reward_std": 0.16899192333221436,
"rewards/accuracy_reward": 0.47783973813056946,
"rewards/format_reward": 0.99609375,
"step": 398
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 90.78515625,
"epoch": 0.7542533081285444,
"grad_norm": 1.8839648182094828,
"kl": 0.041748046875,
"learning_rate": 2.8350815000604976e-07,
"loss": 0.0017,
"reward": 1.55859375,
"reward_std": 0.23474985361099243,
"rewards/accuracy_reward": 0.5703125,
"rewards/format_reward": 0.98828125,
"step": 399
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 93.2578125,
"epoch": 0.7561436672967864,
"grad_norm": 1.4293529083675727,
"kl": 0.04541015625,
"learning_rate": 2.7937797214657143e-07,
"loss": 0.0018,
"reward": 1.5872396230697632,
"reward_std": 0.1798129379749298,
"rewards/accuracy_reward": 0.6106771230697632,
"rewards/format_reward": 0.9765625,
"step": 400
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 86.91796875,
"epoch": 0.7580340264650284,
"grad_norm": 2.026608250956968,
"kl": 0.04541015625,
"learning_rate": 2.752732095530279e-07,
"loss": 0.0018,
"reward": 1.4875531196594238,
"reward_std": 0.18699489533901215,
"rewards/accuracy_reward": 0.48755308985710144,
"rewards/format_reward": 1.0,
"step": 401
},
{
"all_correct": 0.53125,
"all_wrong": 0.15625,
"completion_length": 99.1640625,
"epoch": 0.7599243856332704,
"grad_norm": 1.5439354460633543,
"kl": 0.04443359375,
"learning_rate": 2.711940069942833e-07,
"loss": 0.0018,
"reward": 1.6947214603424072,
"reward_std": 0.1390814185142517,
"rewards/accuracy_reward": 0.7181590795516968,
"rewards/format_reward": 0.9765625,
"step": 402
},
{
"all_correct": 0.375,
"all_wrong": 0.09375,
"completion_length": 96.125,
"epoch": 0.7618147448015122,
"grad_norm": 1.9024078180465291,
"kl": 0.039794921875,
"learning_rate": 2.671405083377386e-07,
"loss": 0.0016,
"reward": 1.6169142723083496,
"reward_std": 0.19913235306739807,
"rewards/accuracy_reward": 0.6169142723083496,
"rewards/format_reward": 1.0,
"step": 403
},
{
"all_correct": 0.34375,
"all_wrong": 0.09375,
"completion_length": 94.4921875,
"epoch": 0.7637051039697542,
"grad_norm": 2.8962041644930108,
"kl": 0.042724609375,
"learning_rate": 2.6311285654425574e-07,
"loss": 0.0017,
"reward": 1.6525933742523193,
"reward_std": 0.25046759843826294,
"rewards/accuracy_reward": 0.6525933146476746,
"rewards/format_reward": 1.0,
"step": 404
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 87.58203125,
"epoch": 0.7655954631379962,
"grad_norm": 1.4168894497370028,
"kl": 0.0419921875,
"learning_rate": 2.59111193663116e-07,
"loss": 0.0017,
"reward": 1.505859375,
"reward_std": 0.17382082343101501,
"rewards/accuracy_reward": 0.505859375,
"rewards/format_reward": 1.0,
"step": 405
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 90.953125,
"epoch": 0.7674858223062382,
"grad_norm": 2.400477941677905,
"kl": 0.044677734375,
"learning_rate": 2.5513566082701134e-07,
"loss": 0.0018,
"reward": 1.6569660902023315,
"reward_std": 0.14563217759132385,
"rewards/accuracy_reward": 0.6569661498069763,
"rewards/format_reward": 1.0,
"step": 406
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 94.703125,
"epoch": 0.7693761814744802,
"grad_norm": 1.2607140346650068,
"kl": 0.03515625,
"learning_rate": 2.51186398247065e-07,
"loss": 0.0014,
"reward": 1.6192121505737305,
"reward_std": 0.11734330654144287,
"rewards/accuracy_reward": 0.6192121505737305,
"rewards/format_reward": 1.0,
"step": 407
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 91.6015625,
"epoch": 0.7712665406427222,
"grad_norm": 3.7794955364804887,
"kl": 0.044677734375,
"learning_rate": 2.472635452078883e-07,
"loss": 0.0018,
"reward": 1.5780255794525146,
"reward_std": 0.1755901575088501,
"rewards/accuracy_reward": 0.5780255794525146,
"rewards/format_reward": 1.0,
"step": 408
},
{
"all_correct": 0.40625,
"all_wrong": 0.25,
"completion_length": 89.140625,
"epoch": 0.7731568998109641,
"grad_norm": 1.6217685852043846,
"kl": 0.0400390625,
"learning_rate": 2.433672400626663e-07,
"loss": 0.0016,
"reward": 1.6167256832122803,
"reward_std": 0.1386527717113495,
"rewards/accuracy_reward": 0.616725742816925,
"rewards/format_reward": 1.0,
"step": 409
},
{
"all_correct": 0.28125,
"all_wrong": 0.21875,
"completion_length": 88.8203125,
"epoch": 0.775047258979206,
"grad_norm": 2.028364703410531,
"kl": 0.051025390625,
"learning_rate": 2.3949762022828093e-07,
"loss": 0.002,
"reward": 1.5439236164093018,
"reward_std": 0.22322307527065277,
"rewards/accuracy_reward": 0.5439236164093018,
"rewards/format_reward": 1.0,
"step": 410
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 92.33203125,
"epoch": 0.776937618147448,
"grad_norm": 2.2480672974329035,
"kl": 0.04931640625,
"learning_rate": 2.3565482218046073e-07,
"loss": 0.002,
"reward": 1.567735195159912,
"reward_std": 0.2084602564573288,
"rewards/accuracy_reward": 0.5677351355552673,
"rewards/format_reward": 1.0,
"step": 411
},
{
"all_correct": 0.5625,
"all_wrong": 0.0625,
"completion_length": 94.765625,
"epoch": 0.77882797731569,
"grad_norm": 1.8397495365435537,
"kl": 0.04248046875,
"learning_rate": 2.3183898144897175e-07,
"loss": 0.0017,
"reward": 1.7432655096054077,
"reward_std": 0.177871972322464,
"rewards/accuracy_reward": 0.7432655096054077,
"rewards/format_reward": 1.0,
"step": 412
},
{
"all_correct": 0.1875,
"all_wrong": 0.25,
"completion_length": 90.35546875,
"epoch": 0.780718336483932,
"grad_norm": 2.4627560536575497,
"kl": 0.0458984375,
"learning_rate": 2.2805023261283496e-07,
"loss": 0.0018,
"reward": 1.4680068492889404,
"reward_std": 0.2125786542892456,
"rewards/accuracy_reward": 0.46800681948661804,
"rewards/format_reward": 1.0,
"step": 413
},
{
"all_correct": 0.15625,
"all_wrong": 0.21875,
"completion_length": 96.5859375,
"epoch": 0.782608695652174,
"grad_norm": 21.851694077321632,
"kl": 0.042724609375,
"learning_rate": 2.2428870929558007e-07,
"loss": 0.0017,
"reward": 1.4453372955322266,
"reward_std": 0.26194411516189575,
"rewards/accuracy_reward": 0.4453372359275818,
"rewards/format_reward": 1.0,
"step": 414
},
{
"all_correct": 0.28125,
"all_wrong": 0.3125,
"completion_length": 95.6015625,
"epoch": 0.7844990548204159,
"grad_norm": 2.1610667049451417,
"kl": 0.04833984375,
"learning_rate": 2.205545441605342e-07,
"loss": 0.0019,
"reward": 1.4564586877822876,
"reward_std": 0.1753809005022049,
"rewards/accuracy_reward": 0.46427121758461,
"rewards/format_reward": 0.9921875,
"step": 415
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 91.46875,
"epoch": 0.7863894139886578,
"grad_norm": 10.481690705107473,
"kl": 0.04443359375,
"learning_rate": 2.1684786890614127e-07,
"loss": 0.0018,
"reward": 1.5876367092132568,
"reward_std": 0.17977751791477203,
"rewards/accuracy_reward": 0.5876367092132568,
"rewards/format_reward": 1.0,
"step": 416
},
{
"all_correct": 0.34375,
"all_wrong": 0.125,
"completion_length": 94.65625,
"epoch": 0.7882797731568998,
"grad_norm": 2.8247468950858647,
"kl": 0.039306640625,
"learning_rate": 2.1316881426131827e-07,
"loss": 0.0016,
"reward": 1.6735260486602783,
"reward_std": 0.17629718780517578,
"rewards/accuracy_reward": 0.6735259890556335,
"rewards/format_reward": 1.0,
"step": 417
},
{
"all_correct": 0.25,
"all_wrong": 0.125,
"completion_length": 95.9765625,
"epoch": 0.7901701323251418,
"grad_norm": 1.9238663712431303,
"kl": 0.041259765625,
"learning_rate": 2.0951750998084438e-07,
"loss": 0.0016,
"reward": 1.5885775089263916,
"reward_std": 0.2954632043838501,
"rewards/accuracy_reward": 0.6002963781356812,
"rewards/format_reward": 0.98828125,
"step": 418
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 92.58203125,
"epoch": 0.7920604914933838,
"grad_norm": 2.082634838675884,
"kl": 0.04296875,
"learning_rate": 2.058940848407854e-07,
"loss": 0.0017,
"reward": 1.5185022354125977,
"reward_std": 0.21989545226097107,
"rewards/accuracy_reward": 0.5185022950172424,
"rewards/format_reward": 1.0,
"step": 419
},
{
"all_correct": 0.34375,
"all_wrong": 0.15625,
"completion_length": 95.96484375,
"epoch": 0.7939508506616257,
"grad_norm": 1.4637336629082067,
"kl": 0.038818359375,
"learning_rate": 2.0229866663395023e-07,
"loss": 0.0016,
"reward": 1.5763494968414307,
"reward_std": 0.22779253125190735,
"rewards/accuracy_reward": 0.5919744372367859,
"rewards/format_reward": 0.984375,
"step": 420
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 94.42578125,
"epoch": 0.7958412098298677,
"grad_norm": 1.3544480164552257,
"kl": 0.0419921875,
"learning_rate": 1.9873138216538609e-07,
"loss": 0.0017,
"reward": 1.6315104961395264,
"reward_std": 0.15740279853343964,
"rewards/accuracy_reward": 0.6354166865348816,
"rewards/format_reward": 0.99609375,
"step": 421
},
{
"all_correct": 0.25,
"all_wrong": 0.28125,
"completion_length": 91.765625,
"epoch": 0.7977315689981096,
"grad_norm": 9.201439231037005,
"kl": 0.042724609375,
"learning_rate": 1.951923572479044e-07,
"loss": 0.0017,
"reward": 1.479612112045288,
"reward_std": 0.19981667399406433,
"rewards/accuracy_reward": 0.4796121120452881,
"rewards/format_reward": 1.0,
"step": 422
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 94.84765625,
"epoch": 0.7996219281663516,
"grad_norm": 1.4237426067896413,
"kl": 0.048583984375,
"learning_rate": 1.916817166976441e-07,
"loss": 0.0019,
"reward": 1.5277478694915771,
"reward_std": 0.2083957940340042,
"rewards/accuracy_reward": 0.5355602502822876,
"rewards/format_reward": 0.9921875,
"step": 423
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 91.10546875,
"epoch": 0.8015122873345936,
"grad_norm": 2.293182503990981,
"kl": 0.041259765625,
"learning_rate": 1.8819958432967076e-07,
"loss": 0.0017,
"reward": 1.558853268623352,
"reward_std": 0.18910501897335052,
"rewards/accuracy_reward": 0.562759518623352,
"rewards/format_reward": 0.99609375,
"step": 424
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 95.41015625,
"epoch": 0.8034026465028355,
"grad_norm": 1.5366707245335558,
"kl": 0.0419921875,
"learning_rate": 1.847460829536075e-07,
"loss": 0.0017,
"reward": 1.6059564352035522,
"reward_std": 0.2242327332496643,
"rewards/accuracy_reward": 0.6215814352035522,
"rewards/format_reward": 0.984375,
"step": 425
},
{
"all_correct": 0.375,
"all_wrong": 0.21875,
"completion_length": 90.1484375,
"epoch": 0.8052930056710775,
"grad_norm": 1.3312039879338635,
"kl": 0.041748046875,
"learning_rate": 1.813213343693064e-07,
"loss": 0.0017,
"reward": 1.5957437753677368,
"reward_std": 0.16802389919757843,
"rewards/accuracy_reward": 0.595743715763092,
"rewards/format_reward": 1.0,
"step": 426
},
{
"all_correct": 0.34375,
"all_wrong": 0.09375,
"completion_length": 97.79296875,
"epoch": 0.8071833648393195,
"grad_norm": 2.9517888821154714,
"kl": 0.04248046875,
"learning_rate": 1.779254593625501e-07,
"loss": 0.0017,
"reward": 1.6021231412887573,
"reward_std": 0.22704648971557617,
"rewards/accuracy_reward": 0.6021231412887573,
"rewards/format_reward": 1.0,
"step": 427
},
{
"all_correct": 0.375,
"all_wrong": 0.25,
"completion_length": 92.49609375,
"epoch": 0.8090737240075614,
"grad_norm": 1.741276036032744,
"kl": 0.048095703125,
"learning_rate": 1.745585777007943e-07,
"loss": 0.0019,
"reward": 1.5888817310333252,
"reward_std": 0.09943927079439163,
"rewards/accuracy_reward": 0.5888816714286804,
"rewards/format_reward": 1.0,
"step": 428
},
{
"all_correct": 0.4375,
"all_wrong": 0.09375,
"completion_length": 95.453125,
"epoch": 0.8109640831758034,
"grad_norm": 1.5867733017615009,
"kl": 0.03955078125,
"learning_rate": 1.7122080812894146e-07,
"loss": 0.0016,
"reward": 1.6287798881530762,
"reward_std": 0.19112670421600342,
"rewards/accuracy_reward": 0.6287798881530762,
"rewards/format_reward": 1.0,
"step": 429
},
{
"all_correct": 0.21875,
"all_wrong": 0.15625,
"completion_length": 97.41796875,
"epoch": 0.8128544423440454,
"grad_norm": 1.8869110513185203,
"kl": 0.04296875,
"learning_rate": 1.679122683651546e-07,
"loss": 0.0017,
"reward": 1.5692415237426758,
"reward_std": 0.24745473265647888,
"rewards/accuracy_reward": 0.569241464138031,
"rewards/format_reward": 1.0,
"step": 430
},
{
"all_correct": 0.3125,
"all_wrong": 0.15625,
"completion_length": 96.97265625,
"epoch": 0.8147448015122873,
"grad_norm": 2.0763617955307274,
"kl": 0.0478515625,
"learning_rate": 1.6463307509670522e-07,
"loss": 0.0019,
"reward": 1.5514062643051147,
"reward_std": 0.21106550097465515,
"rewards/accuracy_reward": 0.5514062643051147,
"rewards/format_reward": 1.0,
"step": 431
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 96.08984375,
"epoch": 0.8166351606805293,
"grad_norm": 4.650342860685201,
"kl": 0.044189453125,
"learning_rate": 1.6138334397585674e-07,
"loss": 0.0018,
"reward": 1.5383667945861816,
"reward_std": 0.166859969496727,
"rewards/accuracy_reward": 0.5500854849815369,
"rewards/format_reward": 0.98828125,
"step": 432
},
{
"all_correct": 0.3125,
"all_wrong": 0.25,
"completion_length": 90.95703125,
"epoch": 0.8185255198487713,
"grad_norm": 1.7529564775308286,
"kl": 0.04150390625,
"learning_rate": 1.5816318961578756e-07,
"loss": 0.0017,
"reward": 1.4878764152526855,
"reward_std": 0.18354278802871704,
"rewards/accuracy_reward": 0.48787635564804077,
"rewards/format_reward": 1.0,
"step": 433
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 85.59375,
"epoch": 0.8204158790170132,
"grad_norm": 3.6359061398641677,
"kl": 0.048095703125,
"learning_rate": 1.5497272558654695e-07,
"loss": 0.0019,
"reward": 1.4656922817230225,
"reward_std": 0.2116546779870987,
"rewards/accuracy_reward": 0.46959853172302246,
"rewards/format_reward": 0.99609375,
"step": 434
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 95.8515625,
"epoch": 0.8223062381852552,
"grad_norm": 4.974875276682775,
"kl": 0.046142578125,
"learning_rate": 1.5181206441105077e-07,
"loss": 0.0018,
"reward": 1.6275532245635986,
"reward_std": 0.20301690697669983,
"rewards/accuracy_reward": 0.6314594745635986,
"rewards/format_reward": 0.99609375,
"step": 435
},
{
"all_correct": 0.4375,
"all_wrong": 0.125,
"completion_length": 90.0625,
"epoch": 0.8241965973534972,
"grad_norm": 2.449754366709296,
"kl": 0.04296875,
"learning_rate": 1.4868131756111223e-07,
"loss": 0.0017,
"reward": 1.5798760652542114,
"reward_std": 0.2174547016620636,
"rewards/accuracy_reward": 0.5994073152542114,
"rewards/format_reward": 0.98046875,
"step": 436
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 99.25390625,
"epoch": 0.8260869565217391,
"grad_norm": 1.4549018063192884,
"kl": 0.047607421875,
"learning_rate": 1.4558059545351142e-07,
"loss": 0.0019,
"reward": 1.5477688312530518,
"reward_std": 0.2062843143939972,
"rewards/accuracy_reward": 0.5673000812530518,
"rewards/format_reward": 0.98046875,
"step": 437
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 91.74609375,
"epoch": 0.8279773156899811,
"grad_norm": 1.2525640014161112,
"kl": 0.04248046875,
"learning_rate": 1.425100074461003e-07,
"loss": 0.0017,
"reward": 1.6436383724212646,
"reward_std": 0.1697710007429123,
"rewards/accuracy_reward": 0.6436384320259094,
"rewards/format_reward": 1.0,
"step": 438
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 93.125,
"epoch": 0.8298676748582231,
"grad_norm": 2.3121952823917042,
"kl": 0.040283203125,
"learning_rate": 1.394696618339456e-07,
"loss": 0.0016,
"reward": 1.6626487970352173,
"reward_std": 0.174302339553833,
"rewards/accuracy_reward": 0.6626487970352173,
"rewards/format_reward": 1.0,
"step": 439
},
{
"all_correct": 0.3125,
"all_wrong": 0.34375,
"completion_length": 93.94140625,
"epoch": 0.831758034026465,
"grad_norm": 1.3605971037488336,
"kl": 0.045166015625,
"learning_rate": 1.364596658455105e-07,
"loss": 0.0018,
"reward": 1.5049912929534912,
"reward_std": 0.12267406284809113,
"rewards/accuracy_reward": 0.504991352558136,
"rewards/format_reward": 1.0,
"step": 440
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 91.05859375,
"epoch": 0.833648393194707,
"grad_norm": 1.4329708155173375,
"kl": 0.043701171875,
"learning_rate": 1.33480125638871e-07,
"loss": 0.0017,
"reward": 1.6171175241470337,
"reward_std": 0.1537449210882187,
"rewards/accuracy_reward": 0.6210237741470337,
"rewards/format_reward": 0.99609375,
"step": 441
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 88.9765625,
"epoch": 0.8355387523629489,
"grad_norm": 17.688886780647458,
"kl": 0.0400390625,
"learning_rate": 1.3053114629797435e-07,
"loss": 0.0016,
"reward": 1.5753612518310547,
"reward_std": 0.18671679496765137,
"rewards/accuracy_reward": 0.5753612518310547,
"rewards/format_reward": 1.0,
"step": 442
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 98.7578125,
"epoch": 0.8374291115311909,
"grad_norm": 1.9625832093816682,
"kl": 0.045654296875,
"learning_rate": 1.2761283182893047e-07,
"loss": 0.0018,
"reward": 1.5934289693832397,
"reward_std": 0.21815939247608185,
"rewards/accuracy_reward": 0.5934289693832397,
"rewards/format_reward": 1.0,
"step": 443
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 96.1328125,
"epoch": 0.8393194706994329,
"grad_norm": 1.580613447705244,
"kl": 0.039794921875,
"learning_rate": 1.2472528515634585e-07,
"loss": 0.0016,
"reward": 1.6483333110809326,
"reward_std": 0.19740188121795654,
"rewards/accuracy_reward": 0.6483333110809326,
"rewards/format_reward": 1.0,
"step": 444
},
{
"all_correct": 0.53125,
"all_wrong": 0.1875,
"completion_length": 94.78125,
"epoch": 0.8412098298676749,
"grad_norm": 1.1020475480569663,
"kl": 0.03955078125,
"learning_rate": 1.2186860811969168e-07,
"loss": 0.0016,
"reward": 1.702857494354248,
"reward_std": 0.13044710457324982,
"rewards/accuracy_reward": 0.7028576135635376,
"rewards/format_reward": 1.0,
"step": 445
},
{
"all_correct": 0.375,
"all_wrong": 0.125,
"completion_length": 94.3515625,
"epoch": 0.8431001890359168,
"grad_norm": 3.1193628774228603,
"kl": 0.048583984375,
"learning_rate": 1.1904290146971397e-07,
"loss": 0.0019,
"reward": 1.6091580390930176,
"reward_std": 0.2114795446395874,
"rewards/accuracy_reward": 0.6130642294883728,
"rewards/format_reward": 0.99609375,
"step": 446
},
{
"all_correct": 0.5,
"all_wrong": 0.15625,
"completion_length": 93.82421875,
"epoch": 0.8449905482041588,
"grad_norm": 1.5499881081679403,
"kl": 0.046142578125,
"learning_rate": 1.1624826486487872e-07,
"loss": 0.0018,
"reward": 1.68212890625,
"reward_std": 0.1655098795890808,
"rewards/accuracy_reward": 0.68994140625,
"rewards/format_reward": 0.9921875,
"step": 447
},
{
"all_correct": 0.46875,
"all_wrong": 0.1875,
"completion_length": 96.47265625,
"epoch": 0.8468809073724007,
"grad_norm": 1.3637912127369223,
"kl": 0.04248046875,
"learning_rate": 1.134847968678575e-07,
"loss": 0.0017,
"reward": 1.6614583730697632,
"reward_std": 0.15649890899658203,
"rewards/accuracy_reward": 0.6809896230697632,
"rewards/format_reward": 0.98046875,
"step": 448
},
{
"all_correct": 0.5,
"all_wrong": 0.21875,
"completion_length": 90.0703125,
"epoch": 0.8487712665406427,
"grad_norm": 1.1591726049731972,
"kl": 0.0390625,
"learning_rate": 1.1075259494205225e-07,
"loss": 0.0016,
"reward": 1.655552625656128,
"reward_std": 0.12017819285392761,
"rewards/accuracy_reward": 0.6594588756561279,
"rewards/format_reward": 0.99609375,
"step": 449
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 92.24609375,
"epoch": 0.8506616257088847,
"grad_norm": 2.039439334103359,
"kl": 0.0419921875,
"learning_rate": 1.0805175544815648e-07,
"loss": 0.0017,
"reward": 1.6180661916732788,
"reward_std": 0.16808518767356873,
"rewards/accuracy_reward": 0.6180662512779236,
"rewards/format_reward": 1.0,
"step": 450
},
{
"all_correct": 0.40625,
"all_wrong": 0.09375,
"completion_length": 87.765625,
"epoch": 0.8525519848771267,
"grad_norm": 3.551379163758167,
"kl": 0.048095703125,
"learning_rate": 1.0538237364075786e-07,
"loss": 0.0019,
"reward": 1.7047264575958252,
"reward_std": 0.17995205521583557,
"rewards/accuracy_reward": 0.7047264575958252,
"rewards/format_reward": 1.0,
"step": 451
},
{
"all_correct": 0.3125,
"all_wrong": 0.1875,
"completion_length": 94.16796875,
"epoch": 0.8544423440453687,
"grad_norm": 2.4552903396429553,
"kl": 0.04296875,
"learning_rate": 1.0274454366497787e-07,
"loss": 0.0017,
"reward": 1.5992646217346191,
"reward_std": 0.19959133863449097,
"rewards/accuracy_reward": 0.5992645621299744,
"rewards/format_reward": 1.0,
"step": 452
},
{
"all_correct": 0.34375,
"all_wrong": 0.21875,
"completion_length": 98.84375,
"epoch": 0.8563327032136105,
"grad_norm": 1.9864652083005676,
"kl": 0.04296875,
"learning_rate": 1.0013835855315233e-07,
"loss": 0.0017,
"reward": 1.547175407409668,
"reward_std": 0.18141832947731018,
"rewards/accuracy_reward": 0.5471754670143127,
"rewards/format_reward": 1.0,
"step": 453
},
{
"all_correct": 0.375,
"all_wrong": 0.15625,
"completion_length": 91.21484375,
"epoch": 0.8582230623818525,
"grad_norm": 1.4554139396903383,
"kl": 0.04345703125,
"learning_rate": 9.756391022154953e-08,
"loss": 0.0017,
"reward": 1.6482672691345215,
"reward_std": 0.20143428444862366,
"rewards/accuracy_reward": 0.6482672095298767,
"rewards/format_reward": 1.0,
"step": 454
},
{
"all_correct": 0.40625,
"all_wrong": 0.21875,
"completion_length": 91.6875,
"epoch": 0.8601134215500945,
"grad_norm": 1.3848840834631115,
"kl": 0.05029296875,
"learning_rate": 9.502128946712862e-08,
"loss": 0.002,
"reward": 1.5533459186553955,
"reward_std": 0.1582801640033722,
"rewards/accuracy_reward": 0.5572521090507507,
"rewards/format_reward": 0.99609375,
"step": 455
},
{
"all_correct": 0.40625,
"all_wrong": 0.28125,
"completion_length": 87.30078125,
"epoch": 0.8620037807183365,
"grad_norm": 1.474494768169295,
"kl": 0.0390625,
"learning_rate": 9.251058596433792e-08,
"loss": 0.0016,
"reward": 1.566476821899414,
"reward_std": 0.08095038682222366,
"rewards/accuracy_reward": 0.5664768218994141,
"rewards/format_reward": 1.0,
"step": 456
},
{
"all_correct": 0.40625,
"all_wrong": 0.0625,
"completion_length": 92.46484375,
"epoch": 0.8638941398865785,
"grad_norm": 2.228512668796117,
"kl": 0.04150390625,
"learning_rate": 9.003188826195141e-08,
"loss": 0.0017,
"reward": 1.6768805980682373,
"reward_std": 0.21106885373592377,
"rewards/accuracy_reward": 0.6768805384635925,
"rewards/format_reward": 1.0,
"step": 457
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 89.76953125,
"epoch": 0.8657844990548205,
"grad_norm": 2.381172656339536,
"kl": 0.043701171875,
"learning_rate": 8.758528377994667e-08,
"loss": 0.0017,
"reward": 1.5411908626556396,
"reward_std": 0.182044118642807,
"rewards/accuracy_reward": 0.5411908626556396,
"rewards/format_reward": 1.0,
"step": 458
},
{
"all_correct": 0.46875,
"all_wrong": 0.21875,
"completion_length": 97.53125,
"epoch": 0.8676748582230623,
"grad_norm": 1.1370034333672576,
"kl": 0.042724609375,
"learning_rate": 8.51708588064206e-08,
"loss": 0.0017,
"reward": 1.5652220249176025,
"reward_std": 0.13882781565189362,
"rewards/accuracy_reward": 0.5730345249176025,
"rewards/format_reward": 0.9921875,
"step": 459
},
{
"all_correct": 0.25,
"all_wrong": 0.28125,
"completion_length": 99.2421875,
"epoch": 0.8695652173913043,
"grad_norm": 1.9476181687483543,
"kl": 0.041259765625,
"learning_rate": 8.278869849454717e-08,
"loss": 0.0017,
"reward": 1.478024959564209,
"reward_std": 0.17786462604999542,
"rewards/accuracy_reward": 0.47802501916885376,
"rewards/format_reward": 1.0,
"step": 460
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 100.20703125,
"epoch": 0.8714555765595463,
"grad_norm": 1.224210562302336,
"kl": 0.0380859375,
"learning_rate": 8.043888685957312e-08,
"loss": 0.0015,
"reward": 1.5925004482269287,
"reward_std": 0.14712047576904297,
"rewards/accuracy_reward": 0.5925004482269287,
"rewards/format_reward": 1.0,
"step": 461
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 91.64453125,
"epoch": 0.8733459357277883,
"grad_norm": 4.6192400629488395,
"kl": 0.045654296875,
"learning_rate": 7.812150677585671e-08,
"loss": 0.0018,
"reward": 1.5704599618911743,
"reward_std": 0.1768045723438263,
"rewards/accuracy_reward": 0.5743662118911743,
"rewards/format_reward": 0.99609375,
"step": 462
},
{
"all_correct": 0.375,
"all_wrong": 0.28125,
"completion_length": 94.3828125,
"epoch": 0.8752362948960303,
"grad_norm": 1.9953801863403324,
"kl": 0.049560546875,
"learning_rate": 7.58366399739424e-08,
"loss": 0.002,
"reward": 1.5050649642944336,
"reward_std": 0.11211474239826202,
"rewards/accuracy_reward": 0.5089712142944336,
"rewards/format_reward": 0.99609375,
"step": 463
},
{
"all_correct": 0.4375,
"all_wrong": 0.125,
"completion_length": 90.1640625,
"epoch": 0.8771266540642723,
"grad_norm": 1.8235799095393221,
"kl": 0.043212890625,
"learning_rate": 7.358436703768034e-08,
"loss": 0.0017,
"reward": 1.589560627937317,
"reward_std": 0.16465333104133606,
"rewards/accuracy_reward": 0.5895605683326721,
"rewards/format_reward": 1.0,
"step": 464
},
{
"all_correct": 0.4375,
"all_wrong": 0.21875,
"completion_length": 95.63671875,
"epoch": 0.8790170132325141,
"grad_norm": 1.0760041562687386,
"kl": 0.0380859375,
"learning_rate": 7.136476740138387e-08,
"loss": 0.0015,
"reward": 1.5880682468414307,
"reward_std": 0.15734529495239258,
"rewards/accuracy_reward": 0.6154119968414307,
"rewards/format_reward": 0.97265625,
"step": 465
},
{
"all_correct": 0.25,
"all_wrong": 0.25,
"completion_length": 101.12890625,
"epoch": 0.8809073724007561,
"grad_norm": 1.4738314342767895,
"kl": 0.044189453125,
"learning_rate": 6.917791934702655e-08,
"loss": 0.0018,
"reward": 1.4817919731140137,
"reward_std": 0.2077905535697937,
"rewards/accuracy_reward": 0.48179197311401367,
"rewards/format_reward": 1.0,
"step": 466
},
{
"all_correct": 0.53125,
"all_wrong": 0.15625,
"completion_length": 95.58203125,
"epoch": 0.8827977315689981,
"grad_norm": 1.0571716079445075,
"kl": 0.04296875,
"learning_rate": 6.70239000014835e-08,
"loss": 0.0017,
"reward": 1.6927690505981445,
"reward_std": 0.14029529690742493,
"rewards/accuracy_reward": 0.6966753005981445,
"rewards/format_reward": 0.99609375,
"step": 467
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 97.92578125,
"epoch": 0.8846880907372401,
"grad_norm": 1.5234291263564896,
"kl": 0.0458984375,
"learning_rate": 6.490278533380955e-08,
"loss": 0.0018,
"reward": 1.540100336074829,
"reward_std": 0.2111871838569641,
"rewards/accuracy_reward": 0.5635378360748291,
"rewards/format_reward": 0.9765625,
"step": 468
},
{
"all_correct": 0.375,
"all_wrong": 0.1875,
"completion_length": 83.59375,
"epoch": 0.8865784499054821,
"grad_norm": 2.0255739847362766,
"kl": 0.049560546875,
"learning_rate": 6.281465015256093e-08,
"loss": 0.002,
"reward": 1.5604361295700073,
"reward_std": 0.17061945796012878,
"rewards/accuracy_reward": 0.5604361891746521,
"rewards/format_reward": 1.0,
"step": 469
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 103.4921875,
"epoch": 0.888468809073724,
"grad_norm": 1.4503561746862432,
"kl": 0.044921875,
"learning_rate": 6.075956810315619e-08,
"loss": 0.0018,
"reward": 1.599052906036377,
"reward_std": 0.1590302437543869,
"rewards/accuracy_reward": 0.5990527868270874,
"rewards/format_reward": 1.0,
"step": 470
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 85.69921875,
"epoch": 0.8903591682419659,
"grad_norm": 1.9843134997693552,
"kl": 0.052001953125,
"learning_rate": 5.8737611665279355e-08,
"loss": 0.0021,
"reward": 1.624929666519165,
"reward_std": 0.15200699865818024,
"rewards/accuracy_reward": 0.624929666519165,
"rewards/format_reward": 1.0,
"step": 471
},
{
"all_correct": 0.25,
"all_wrong": 0.125,
"completion_length": 95.69921875,
"epoch": 0.8922495274102079,
"grad_norm": 1.9277786145166012,
"kl": 0.0439453125,
"learning_rate": 5.6748852150323215e-08,
"loss": 0.0018,
"reward": 1.5506601333618164,
"reward_std": 0.21757060289382935,
"rewards/accuracy_reward": 0.5506601333618164,
"rewards/format_reward": 1.0,
"step": 472
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 94.9140625,
"epoch": 0.8941398865784499,
"grad_norm": 1.3171715987687558,
"kl": 0.04150390625,
"learning_rate": 5.479335969887466e-08,
"loss": 0.0017,
"reward": 1.486616611480713,
"reward_std": 0.17591437697410583,
"rewards/accuracy_reward": 0.49052292108535767,
"rewards/format_reward": 0.99609375,
"step": 473
},
{
"all_correct": 0.34375,
"all_wrong": 0.34375,
"completion_length": 92.6640625,
"epoch": 0.8960302457466919,
"grad_norm": 1.1864505239231775,
"kl": 0.05126953125,
"learning_rate": 5.2871203278240906e-08,
"loss": 0.002,
"reward": 1.528516173362732,
"reward_std": 0.11142821609973907,
"rewards/accuracy_reward": 0.5285161733627319,
"rewards/format_reward": 1.0,
"step": 474
},
{
"all_correct": 0.28125,
"all_wrong": 0.1875,
"completion_length": 99.765625,
"epoch": 0.8979206049149339,
"grad_norm": 3.793748033276946,
"kl": 0.0390625,
"learning_rate": 5.098245068001661e-08,
"loss": 0.0016,
"reward": 1.4771013259887695,
"reward_std": 0.22667381167411804,
"rewards/accuracy_reward": 0.47710129618644714,
"rewards/format_reward": 1.0,
"step": 475
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 83.51953125,
"epoch": 0.8998109640831758,
"grad_norm": 6.850505412934267,
"kl": 0.044921875,
"learning_rate": 4.912716851769394e-08,
"loss": 0.0018,
"reward": 1.649026870727539,
"reward_std": 0.15467888116836548,
"rewards/accuracy_reward": 0.6490268707275391,
"rewards/format_reward": 1.0,
"step": 476
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 87.7109375,
"epoch": 0.9017013232514177,
"grad_norm": 3.519242696315356,
"kl": 0.044677734375,
"learning_rate": 4.730542222431222e-08,
"loss": 0.0018,
"reward": 1.6143535375595093,
"reward_std": 0.17040878534317017,
"rewards/accuracy_reward": 0.6143535375595093,
"rewards/format_reward": 1.0,
"step": 477
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 90.99609375,
"epoch": 0.9035916824196597,
"grad_norm": 3.880434261673006,
"kl": 0.038818359375,
"learning_rate": 4.5517276050150325e-08,
"loss": 0.0016,
"reward": 1.5734894275665283,
"reward_std": 0.16900530457496643,
"rewards/accuracy_reward": 0.5734893679618835,
"rewards/format_reward": 1.0,
"step": 478
},
{
"all_correct": 0.4375,
"all_wrong": 0.0625,
"completion_length": 90.35546875,
"epoch": 0.9054820415879017,
"grad_norm": 1.8707086039575145,
"kl": 0.041015625,
"learning_rate": 4.3762793060461824e-08,
"loss": 0.0016,
"reward": 1.6607571840286255,
"reward_std": 0.23725268244743347,
"rewards/accuracy_reward": 0.6763821840286255,
"rewards/format_reward": 0.984375,
"step": 479
},
{
"all_correct": 0.25,
"all_wrong": 0.1875,
"completion_length": 87.125,
"epoch": 0.9073724007561437,
"grad_norm": 2.978657513471123,
"kl": 0.048583984375,
"learning_rate": 4.2042035133248885e-08,
"loss": 0.0019,
"reward": 1.4891417026519775,
"reward_std": 0.2210281491279602,
"rewards/accuracy_reward": 0.48914170265197754,
"rewards/format_reward": 1.0,
"step": 480
},
{
"all_correct": 0.21875,
"all_wrong": 0.21875,
"completion_length": 88.83984375,
"epoch": 0.9092627599243857,
"grad_norm": 1.9106697468452067,
"kl": 0.0458984375,
"learning_rate": 4.035506295708191e-08,
"loss": 0.0018,
"reward": 1.4589961767196655,
"reward_std": 0.21063633263111115,
"rewards/accuracy_reward": 0.46290236711502075,
"rewards/format_reward": 0.99609375,
"step": 481
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 95.41796875,
"epoch": 0.9111531190926276,
"grad_norm": 3.468786099498894,
"kl": 0.046875,
"learning_rate": 3.870193602895733e-08,
"loss": 0.0019,
"reward": 1.6448715925216675,
"reward_std": 0.16561608016490936,
"rewards/accuracy_reward": 0.6448715329170227,
"rewards/format_reward": 1.0,
"step": 482
},
{
"all_correct": 0.46875,
"all_wrong": 0.15625,
"completion_length": 96.2109375,
"epoch": 0.9130434782608695,
"grad_norm": 2.104789227248534,
"kl": 0.0361328125,
"learning_rate": 3.708271265220087e-08,
"loss": 0.0015,
"reward": 1.6773532629013062,
"reward_std": 0.16596126556396484,
"rewards/accuracy_reward": 0.7046970725059509,
"rewards/format_reward": 0.97265625,
"step": 483
},
{
"all_correct": 0.3125,
"all_wrong": 0.28125,
"completion_length": 89.72265625,
"epoch": 0.9149338374291115,
"grad_norm": 6.692599057258857,
"kl": 0.042236328125,
"learning_rate": 3.5497449934409396e-08,
"loss": 0.0017,
"reward": 1.45703125,
"reward_std": 0.19940373301506042,
"rewards/accuracy_reward": 0.46484375,
"rewards/format_reward": 0.9921875,
"step": 484
},
{
"all_correct": 0.5,
"all_wrong": 0.15625,
"completion_length": 89.53515625,
"epoch": 0.9168241965973535,
"grad_norm": 1.8841313081503308,
"kl": 0.05078125,
"learning_rate": 3.394620378543911e-08,
"loss": 0.002,
"reward": 1.6709468364715576,
"reward_std": 0.13965073227882385,
"rewards/accuracy_reward": 0.6826655268669128,
"rewards/format_reward": 0.98828125,
"step": 485
},
{
"all_correct": 0.40625,
"all_wrong": 0.25,
"completion_length": 86.93359375,
"epoch": 0.9187145557655955,
"grad_norm": 1.673566002692786,
"kl": 0.046142578125,
"learning_rate": 3.2429028915431534e-08,
"loss": 0.0018,
"reward": 1.5806677341461182,
"reward_std": 0.13359013199806213,
"rewards/accuracy_reward": 0.5806676149368286,
"rewards/format_reward": 1.0,
"step": 486
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 88.890625,
"epoch": 0.9206049149338374,
"grad_norm": 4.461429054725898,
"kl": 0.041259765625,
"learning_rate": 3.094597883288574e-08,
"loss": 0.0016,
"reward": 1.56640625,
"reward_std": 0.2223491370677948,
"rewards/accuracy_reward": 0.5703125,
"rewards/format_reward": 0.99609375,
"step": 487
},
{
"all_correct": 0.34375,
"all_wrong": 0.09375,
"completion_length": 99.546875,
"epoch": 0.9224952741020794,
"grad_norm": 2.074395570322666,
"kl": 0.04638671875,
"learning_rate": 2.9497105842769433e-08,
"loss": 0.0019,
"reward": 1.6248832941055298,
"reward_std": 0.2539004683494568,
"rewards/accuracy_reward": 0.636601984500885,
"rewards/format_reward": 0.98828125,
"step": 488
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 84.6328125,
"epoch": 0.9243856332703214,
"grad_norm": 1.4101059267718767,
"kl": 0.04833984375,
"learning_rate": 2.808246104467582e-08,
"loss": 0.0019,
"reward": 1.5731714963912964,
"reward_std": 0.1267606019973755,
"rewards/accuracy_reward": 0.5731715559959412,
"rewards/format_reward": 1.0,
"step": 489
},
{
"all_correct": 0.34375,
"all_wrong": 0.125,
"completion_length": 89.83203125,
"epoch": 0.9262759924385633,
"grad_norm": 2.2978085302274267,
"kl": 0.04541015625,
"learning_rate": 2.6702094331020886e-08,
"loss": 0.0018,
"reward": 1.6596397161483765,
"reward_std": 0.18694524466991425,
"rewards/accuracy_reward": 0.6596397161483765,
"rewards/format_reward": 1.0,
"step": 490
},
{
"all_correct": 0.4375,
"all_wrong": 0.28125,
"completion_length": 91.2265625,
"epoch": 0.9281663516068053,
"grad_norm": 1.207942110015228,
"kl": 0.044677734375,
"learning_rate": 2.5356054385282766e-08,
"loss": 0.0018,
"reward": 1.611553430557251,
"reward_std": 0.09651355445384979,
"rewards/accuracy_reward": 0.611553430557251,
"rewards/format_reward": 1.0,
"step": 491
},
{
"all_correct": 0.4375,
"all_wrong": 0.0625,
"completion_length": 100.76953125,
"epoch": 0.9300567107750473,
"grad_norm": 1.46194048244851,
"kl": 0.039794921875,
"learning_rate": 2.4044388680286575e-08,
"loss": 0.0016,
"reward": 1.69921875,
"reward_std": 0.25242602825164795,
"rewards/accuracy_reward": 0.7109375,
"rewards/format_reward": 0.98828125,
"step": 492
},
{
"all_correct": 0.4375,
"all_wrong": 0.03125,
"completion_length": 103.296875,
"epoch": 0.9319470699432892,
"grad_norm": 1.6142721258162138,
"kl": 0.03857421875,
"learning_rate": 2.2767143476528306e-08,
"loss": 0.0015,
"reward": 1.6534472703933716,
"reward_std": 0.2080773115158081,
"rewards/accuracy_reward": 0.6690722107887268,
"rewards/format_reward": 0.984375,
"step": 493
},
{
"all_correct": 0.4375,
"all_wrong": 0.0625,
"completion_length": 96.51171875,
"epoch": 0.9338374291115312,
"grad_norm": 1.5128739509537954,
"kl": 0.037841796875,
"learning_rate": 2.152436382054479e-08,
"loss": 0.0015,
"reward": 1.6529420614242554,
"reward_std": 0.22081422805786133,
"rewards/accuracy_reward": 0.6607545614242554,
"rewards/format_reward": 0.9921875,
"step": 494
},
{
"all_correct": 0.4375,
"all_wrong": 0.1875,
"completion_length": 88.4765625,
"epoch": 0.9357277882797732,
"grad_norm": 1.4350308350145478,
"kl": 0.05126953125,
"learning_rate": 2.0316093543323753e-08,
"loss": 0.0021,
"reward": 1.5785757303237915,
"reward_std": 0.1358921080827713,
"rewards/accuracy_reward": 0.5785757303237915,
"rewards/format_reward": 1.0,
"step": 495
},
{
"all_correct": 0.34375,
"all_wrong": 0.28125,
"completion_length": 91.57421875,
"epoch": 0.9376181474480151,
"grad_norm": 1.4943511024476006,
"kl": 0.042236328125,
"learning_rate": 1.914237525875917e-08,
"loss": 0.0017,
"reward": 1.4834184646606445,
"reward_std": 0.1201879009604454,
"rewards/accuracy_reward": 0.48341846466064453,
"rewards/format_reward": 1.0,
"step": 496
},
{
"all_correct": 0.5,
"all_wrong": 0.1875,
"completion_length": 90.48046875,
"epoch": 0.9395085066162571,
"grad_norm": 1.2851391631205682,
"kl": 0.040283203125,
"learning_rate": 1.8003250362147004e-08,
"loss": 0.0016,
"reward": 1.651926040649414,
"reward_std": 0.11862440407276154,
"rewards/accuracy_reward": 0.6519260406494141,
"rewards/format_reward": 1.0,
"step": 497
},
{
"all_correct": 0.34375,
"all_wrong": 0.25,
"completion_length": 91.71484375,
"epoch": 0.941398865784499,
"grad_norm": 3.68427646191117,
"kl": 0.04296875,
"learning_rate": 1.6898759028726283e-08,
"loss": 0.0017,
"reward": 1.55078125,
"reward_std": 0.19926638901233673,
"rewards/accuracy_reward": 0.5546875,
"rewards/format_reward": 0.99609375,
"step": 498
},
{
"all_correct": 0.28125,
"all_wrong": 0.25,
"completion_length": 88.03515625,
"epoch": 0.943289224952741,
"grad_norm": 1.9461501149892964,
"kl": 0.045654296875,
"learning_rate": 1.5828940212261887e-08,
"loss": 0.0018,
"reward": 1.4915789365768433,
"reward_std": 0.1742965131998062,
"rewards/accuracy_reward": 0.49157893657684326,
"rewards/format_reward": 1.0,
"step": 499
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 104.78515625,
"epoch": 0.945179584120983,
"grad_norm": 1.4451954837871217,
"kl": 0.040283203125,
"learning_rate": 1.4793831643670429e-08,
"loss": 0.0016,
"reward": 1.5099118947982788,
"reward_std": 0.25256532430648804,
"rewards/accuracy_reward": 0.5294432044029236,
"rewards/format_reward": 0.98046875,
"step": 500
},
{
"all_correct": 0.46875,
"all_wrong": 0.21875,
"completion_length": 100.34765625,
"epoch": 0.947069943289225,
"grad_norm": 1.6974654768712778,
"kl": 0.039794921875,
"learning_rate": 1.3793469829689986e-08,
"loss": 0.0016,
"reward": 1.6564844846725464,
"reward_std": 0.11369632184505463,
"rewards/accuracy_reward": 0.6564844846725464,
"rewards/format_reward": 1.0,
"step": 501
},
{
"all_correct": 0.34375,
"all_wrong": 0.1875,
"completion_length": 90.28515625,
"epoch": 0.9489603024574669,
"grad_norm": 1.725404425317813,
"kl": 0.040283203125,
"learning_rate": 1.2827890051592127e-08,
"loss": 0.0016,
"reward": 1.6119554042816162,
"reward_std": 0.16685199737548828,
"rewards/accuracy_reward": 0.6119554042816162,
"rewards/format_reward": 1.0,
"step": 502
},
{
"all_correct": 0.3125,
"all_wrong": 0.21875,
"completion_length": 100.02734375,
"epoch": 0.9508506616257089,
"grad_norm": 1.6905254249447337,
"kl": 0.04150390625,
"learning_rate": 1.1897126363937803e-08,
"loss": 0.0017,
"reward": 1.6142412424087524,
"reward_std": 0.18831773102283478,
"rewards/accuracy_reward": 0.6142412424087524,
"rewards/format_reward": 1.0,
"step": 503
},
{
"all_correct": 0.40625,
"all_wrong": 0.15625,
"completion_length": 97.78125,
"epoch": 0.9527410207939508,
"grad_norm": 1.4091412694145533,
"kl": 0.0390625,
"learning_rate": 1.1001211593376525e-08,
"loss": 0.0016,
"reward": 1.5735445022583008,
"reward_std": 0.17769688367843628,
"rewards/accuracy_reward": 0.577450692653656,
"rewards/format_reward": 0.99609375,
"step": 504
},
{
"all_correct": 0.40625,
"all_wrong": 0.1875,
"completion_length": 81.171875,
"epoch": 0.9546313799621928,
"grad_norm": 2.0493717412947645,
"kl": 0.04443359375,
"learning_rate": 1.0140177337488287e-08,
"loss": 0.0018,
"reward": 1.6359052658081055,
"reward_std": 0.15704122185707092,
"rewards/accuracy_reward": 0.6359052062034607,
"rewards/format_reward": 1.0,
"step": 505
},
{
"all_correct": 0.1875,
"all_wrong": 0.1875,
"completion_length": 91.32421875,
"epoch": 0.9565217391304348,
"grad_norm": 3.7195758010441407,
"kl": 0.047607421875,
"learning_rate": 9.314053963669244e-09,
"loss": 0.0019,
"reward": 1.5355010032653809,
"reward_std": 0.20620205998420715,
"rewards/accuracy_reward": 0.5355010628700256,
"rewards/format_reward": 1.0,
"step": 506
},
{
"all_correct": 0.40625,
"all_wrong": 0.125,
"completion_length": 93.41796875,
"epoch": 0.9584120982986768,
"grad_norm": 4.273675771479045,
"kl": 0.0419921875,
"learning_rate": 8.522870608060562e-09,
"loss": 0.0017,
"reward": 1.6529306173324585,
"reward_std": 0.17892761528491974,
"rewards/accuracy_reward": 0.6529306173324585,
"rewards/format_reward": 1.0,
"step": 507
},
{
"all_correct": 0.4375,
"all_wrong": 0.15625,
"completion_length": 95.71875,
"epoch": 0.9603024574669187,
"grad_norm": 1.471506691627891,
"kl": 0.045166015625,
"learning_rate": 7.766655174521464e-09,
"loss": 0.0018,
"reward": 1.6168668270111084,
"reward_std": 0.13944411277770996,
"rewards/accuracy_reward": 0.6168668270111084,
"rewards/format_reward": 1.0,
"step": 508
},
{
"all_correct": 0.25,
"all_wrong": 0.21875,
"completion_length": 94.5,
"epoch": 0.9621928166351607,
"grad_norm": 3.051208934788439,
"kl": 0.042236328125,
"learning_rate": 7.045434333643796e-09,
"loss": 0.0017,
"reward": 1.5761425495147705,
"reward_std": 0.22245533764362335,
"rewards/accuracy_reward": 0.5761424899101257,
"rewards/format_reward": 1.0,
"step": 509
},
{
"all_correct": 0.28125,
"all_wrong": 0.15625,
"completion_length": 90.06640625,
"epoch": 0.9640831758034026,
"grad_norm": 1.9014046704475331,
"kl": 0.042724609375,
"learning_rate": 6.3592335218132235e-09,
"loss": 0.0017,
"reward": 1.5352835655212402,
"reward_std": 0.22351181507110596,
"rewards/accuracy_reward": 0.5352836847305298,
"rewards/format_reward": 1.0,
"step": 510
}
],
"logging_steps": 1.0,
"max_steps": 529,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 510,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}