diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,84034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 76.125, + "epoch": 0.00013333333333333334, + "grad_norm": 10.941964149475098, + "kl": 0.001125335693359375, + "learning_rate": 9.999333333333334e-07, + "loss": 0.0, + "reward": 0.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0625, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0625, + "epoch": 0.0002666666666666667, + "grad_norm": 10.857356071472168, + "kl": 0.000606536865234375, + "learning_rate": 9.998666666666665e-07, + "loss": 0.0, + "reward": 0.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1875, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.3125, + "epoch": 0.0004, + "grad_norm": 164.67886352539062, + "kl": 0.006717681884765625, + "learning_rate": 9.998e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.125, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.9375, + "epoch": 0.0005333333333333334, + "grad_norm": 77.5731201171875, + "kl": 0.004058837890625, + "learning_rate": 9.997333333333333e-07, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.875, + "epoch": 0.0006666666666666666, + "grad_norm": 10.139974594116211, + "kl": 0.001392364501953125, + "learning_rate": 9.996666666666667e-07, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0625, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.4375, + "epoch": 0.0008, + "grad_norm": 17.13385772705078, + "kl": 0.002094268798828125, + "learning_rate": 9.996e-07, + "loss": 0.0001, + "reward": 0.375, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.1875, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.5, + "epoch": 0.0009333333333333333, + "grad_norm": 43.97864532470703, + "kl": 0.0108489990234375, + "learning_rate": 9.995333333333333e-07, + "loss": 0.0004, + "reward": 0.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.125, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.0010666666666666667, + "grad_norm": 24.432096481323242, + "kl": 0.00972747802734375, + "learning_rate": 9.994666666666665e-07, + "loss": 0.0004, + "reward": 0.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1875, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.0625, + "epoch": 0.0012, + "grad_norm": 26.501384735107422, + "kl": 0.015380859375, + "learning_rate": 9.994e-07, + "loss": 0.0006, + "reward": 0.375, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.25, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.4375, + "epoch": 0.0013333333333333333, + "grad_norm": 36.91240310668945, + "kl": 0.024383544921875, + "learning_rate": 9.993333333333333e-07, + "loss": 0.001, + "reward": 0.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.3125, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.375, + "epoch": 0.0014666666666666667, + "grad_norm": 283.3633117675781, + "kl": 0.0240631103515625, + "learning_rate": 9.992666666666665e-07, + "loss": 0.001, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.125, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 0.0016, + "grad_norm": 10.271690368652344, + "kl": 0.00750732421875, + "learning_rate": 9.992e-07, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.125, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.0625, + "epoch": 0.0017333333333333333, + "grad_norm": 13.191391944885254, + "kl": 0.0401611328125, + "learning_rate": 9.991333333333333e-07, + "loss": 0.0016, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.5, + "epoch": 0.0018666666666666666, + "grad_norm": 24.420692443847656, + "kl": 0.05029296875, + "learning_rate": 9.990666666666667e-07, + "loss": 0.002, + "reward": 0.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.3125, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0625, + "epoch": 0.002, + "grad_norm": 83.60469818115234, + "kl": 0.099029541015625, + "learning_rate": 9.989999999999999e-07, + "loss": 0.0039, + "reward": 0.6875, + "reward_std": 0.7216846346855164, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.5, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 0.0021333333333333334, + "grad_norm": 13.748204231262207, + "kl": 0.02899169921875, + "learning_rate": 9.989333333333333e-07, + "loss": 0.0012, + "reward": 0.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6875, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.125, + "epoch": 0.002266666666666667, + "grad_norm": 56.48728561401367, + "kl": 0.04083251953125, + "learning_rate": 9.988666666666667e-07, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.375, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.3125, + "epoch": 0.0024, + "grad_norm": 17.066789627075195, + "kl": 0.0965576171875, + "learning_rate": 9.988e-07, + "loss": 0.0039, + "reward": 0.6875, + "reward_std": 0.8152145147323608, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.4375, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.625, + "epoch": 0.002533333333333333, + "grad_norm": 18.78993797302246, + "kl": 0.0723876953125, + "learning_rate": 9.987333333333333e-07, + "loss": 0.0029, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.6875, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5625, + "epoch": 0.0026666666666666666, + "grad_norm": 112.30597686767578, + "kl": 0.1170654296875, + "learning_rate": 9.986666666666667e-07, + "loss": 0.0047, + "reward": 0.875, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.75, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.125, + "epoch": 0.0028, + "grad_norm": 11.310404777526855, + "kl": 0.03173828125, + "learning_rate": 9.986e-07, + "loss": 0.0013, + "reward": 0.875, + "reward_std": 0.7784976959228516, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.5625, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.625, + "epoch": 0.0029333333333333334, + "grad_norm": 73.48896789550781, + "kl": 0.0684814453125, + "learning_rate": 9.985333333333332e-07, + "loss": 0.0027, + "reward": 1.1875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.4375, + "epoch": 0.0030666666666666668, + "grad_norm": 26.2327938079834, + "kl": 0.122314453125, + "learning_rate": 9.984666666666666e-07, + "loss": 0.0049, + "reward": 1.125, + "reward_std": 0.5940381735563278, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.6875, + "epoch": 0.0032, + "grad_norm": 62.2012939453125, + "kl": 0.0740966796875, + "learning_rate": 9.983999999999998e-07, + "loss": 0.003, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.0033333333333333335, + "grad_norm": 18.201658248901367, + "kl": 0.099853515625, + "learning_rate": 9.983333333333332e-07, + "loss": 0.004, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.3125, + "epoch": 0.0034666666666666665, + "grad_norm": 15.408669471740723, + "kl": 0.122314453125, + "learning_rate": 9.982666666666666e-07, + "loss": 0.0049, + "reward": 1.125, + "reward_std": 0.5940381735563278, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 0.0036, + "grad_norm": 11.711745262145996, + "kl": 0.07080078125, + "learning_rate": 9.982e-07, + "loss": 0.0028, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0, + "epoch": 0.0037333333333333333, + "grad_norm": 10.49715518951416, + "kl": 0.18701171875, + "learning_rate": 9.981333333333332e-07, + "loss": 0.0075, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.375, + "epoch": 0.0038666666666666667, + "grad_norm": 14.67177677154541, + "kl": 0.099609375, + "learning_rate": 9.980666666666666e-07, + "loss": 0.004, + "reward": 0.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6875, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.9375, + "epoch": 0.004, + "grad_norm": 15.351900100708008, + "kl": 0.13818359375, + "learning_rate": 9.98e-07, + "loss": 0.0055, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.875, + "epoch": 0.0041333333333333335, + "grad_norm": 69.30094909667969, + "kl": 0.21337890625, + "learning_rate": 9.979333333333332e-07, + "loss": 0.0086, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.625, + "epoch": 0.004266666666666667, + "grad_norm": 10.869234085083008, + "kl": 0.07763671875, + "learning_rate": 9.978666666666666e-07, + "loss": 0.0031, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.0044, + "grad_norm": 12.941633224487305, + "kl": 0.17431640625, + "learning_rate": 9.978e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.625, + "epoch": 0.004533333333333334, + "grad_norm": 21.701663970947266, + "kl": 0.1142578125, + "learning_rate": 9.977333333333334e-07, + "loss": 0.0046, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 0.004666666666666667, + "grad_norm": 8.196089744567871, + "kl": 0.11328125, + "learning_rate": 9.976666666666666e-07, + "loss": 0.0045, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.25, + "epoch": 0.0048, + "grad_norm": 21.13199234008789, + "kl": 0.24853515625, + "learning_rate": 9.976e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.4375, + "epoch": 0.004933333333333333, + "grad_norm": 16.43806266784668, + "kl": 0.121826171875, + "learning_rate": 9.975333333333334e-07, + "loss": 0.0049, + "reward": 1.25, + "reward_std": 0.7967559993267059, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.0625, + "epoch": 0.005066666666666666, + "grad_norm": 125.96072387695312, + "kl": 1.56982421875, + "learning_rate": 9.974666666666666e-07, + "loss": 0.0628, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.375, + "epoch": 0.0052, + "grad_norm": 23.35045051574707, + "kl": 0.283203125, + "learning_rate": 9.974e-07, + "loss": 0.0113, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.005333333333333333, + "grad_norm": 15.454157829284668, + "kl": 0.184814453125, + "learning_rate": 9.973333333333332e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.5625, + "epoch": 0.0054666666666666665, + "grad_norm": 21.09633445739746, + "kl": 0.201416015625, + "learning_rate": 9.972666666666666e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5, + "epoch": 0.0056, + "grad_norm": 20.7762393951416, + "kl": 0.24560546875, + "learning_rate": 9.972e-07, + "loss": 0.0098, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.8125, + "epoch": 0.005733333333333333, + "grad_norm": 15.187784194946289, + "kl": 0.22900390625, + "learning_rate": 9.971333333333334e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.625, + "epoch": 0.005866666666666667, + "grad_norm": 13.349211692810059, + "kl": 0.168701171875, + "learning_rate": 9.970666666666665e-07, + "loss": 0.0067, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.8125, + "epoch": 0.006, + "grad_norm": 19.770219802856445, + "kl": 0.5419921875, + "learning_rate": 9.97e-07, + "loss": 0.0217, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.0061333333333333335, + "grad_norm": 26.872610092163086, + "kl": 0.22802734375, + "learning_rate": 9.969333333333333e-07, + "loss": 0.0092, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.3125, + "epoch": 0.006266666666666667, + "grad_norm": 19.029434204101562, + "kl": 0.37890625, + "learning_rate": 9.968666666666667e-07, + "loss": 0.0152, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5, + "epoch": 0.0064, + "grad_norm": 17.837528228759766, + "kl": 0.34716796875, + "learning_rate": 9.968e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.125, + "epoch": 0.006533333333333334, + "grad_norm": 22.359027862548828, + "kl": 0.482421875, + "learning_rate": 9.967333333333333e-07, + "loss": 0.0193, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.25, + "epoch": 0.006666666666666667, + "grad_norm": 29.919403076171875, + "kl": 0.466796875, + "learning_rate": 9.966666666666667e-07, + "loss": 0.0186, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.875, + "epoch": 0.0068, + "grad_norm": 11.768159866333008, + "kl": 0.419921875, + "learning_rate": 9.966e-07, + "loss": 0.0167, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.625, + "epoch": 0.006933333333333333, + "grad_norm": 15.612460136413574, + "kl": 0.478515625, + "learning_rate": 9.965333333333333e-07, + "loss": 0.0192, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.75, + "epoch": 0.007066666666666666, + "grad_norm": 14.577803611755371, + "kl": 0.5224609375, + "learning_rate": 9.964666666666665e-07, + "loss": 0.0209, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.1875, + "epoch": 0.0072, + "grad_norm": 16.610231399536133, + "kl": 0.3974609375, + "learning_rate": 9.964e-07, + "loss": 0.0159, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.6875, + "epoch": 0.007333333333333333, + "grad_norm": 18.6441707611084, + "kl": 0.6025390625, + "learning_rate": 9.963333333333333e-07, + "loss": 0.0241, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.25, + "epoch": 0.007466666666666667, + "grad_norm": 17.215923309326172, + "kl": 0.5205078125, + "learning_rate": 9.962666666666667e-07, + "loss": 0.0208, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.6875, + "epoch": 0.0076, + "grad_norm": 42.307430267333984, + "kl": 0.7421875, + "learning_rate": 9.961999999999999e-07, + "loss": 0.0296, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.6875, + "epoch": 0.007733333333333333, + "grad_norm": 68.95027160644531, + "kl": 0.716796875, + "learning_rate": 9.961333333333333e-07, + "loss": 0.0287, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.375, + "epoch": 0.007866666666666666, + "grad_norm": 50.68024444580078, + "kl": 0.751953125, + "learning_rate": 9.960666666666667e-07, + "loss": 0.0301, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.125, + "epoch": 0.008, + "grad_norm": 24.332618713378906, + "kl": 0.6640625, + "learning_rate": 9.959999999999999e-07, + "loss": 0.0265, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.4375, + "epoch": 0.008133333333333333, + "grad_norm": 18.180002212524414, + "kl": 0.671875, + "learning_rate": 9.959333333333333e-07, + "loss": 0.0269, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.1875, + "epoch": 0.008266666666666667, + "grad_norm": 35.451847076416016, + "kl": 0.521484375, + "learning_rate": 9.958666666666667e-07, + "loss": 0.0209, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.875, + "epoch": 0.0084, + "grad_norm": 24.36311912536621, + "kl": 0.685546875, + "learning_rate": 9.958e-07, + "loss": 0.0274, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.125, + "epoch": 0.008533333333333334, + "grad_norm": 11.256855964660645, + "kl": 1.015625, + "learning_rate": 9.957333333333332e-07, + "loss": 0.0405, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.5, + "epoch": 0.008666666666666666, + "grad_norm": 69.02678680419922, + "kl": 1.115234375, + "learning_rate": 9.956666666666666e-07, + "loss": 0.0446, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.9375, + "epoch": 0.0088, + "grad_norm": 26.379562377929688, + "kl": 0.66796875, + "learning_rate": 9.956e-07, + "loss": 0.0267, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.5, + "epoch": 0.008933333333333333, + "grad_norm": 20.60955810546875, + "kl": 0.6474609375, + "learning_rate": 9.955333333333332e-07, + "loss": 0.0259, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.4375, + "epoch": 0.009066666666666667, + "grad_norm": 30.589616775512695, + "kl": 1.328125, + "learning_rate": 9.954666666666666e-07, + "loss": 0.0532, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.4375, + "epoch": 0.0092, + "grad_norm": 30.553367614746094, + "kl": 1.09765625, + "learning_rate": 9.953999999999998e-07, + "loss": 0.0439, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.125, + "epoch": 0.009333333333333334, + "grad_norm": 55.440711975097656, + "kl": 1.6640625, + "learning_rate": 9.953333333333332e-07, + "loss": 0.0666, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.9375, + "epoch": 0.009466666666666667, + "grad_norm": 19.985492706298828, + "kl": 0.62109375, + "learning_rate": 9.952666666666666e-07, + "loss": 0.0248, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.5625, + "epoch": 0.0096, + "grad_norm": 244.921630859375, + "kl": 0.990234375, + "learning_rate": 9.952e-07, + "loss": 0.0396, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.009733333333333333, + "grad_norm": 336.99395751953125, + "kl": 0.98046875, + "learning_rate": 9.951333333333332e-07, + "loss": 0.0391, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 0.009866666666666666, + "grad_norm": 35.66959762573242, + "kl": 0.66015625, + "learning_rate": 9.950666666666666e-07, + "loss": 0.0264, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.625, + "epoch": 0.01, + "grad_norm": 21.033344268798828, + "kl": 0.42578125, + "learning_rate": 9.95e-07, + "loss": 0.0171, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.9375, + "epoch": 0.010133333333333333, + "grad_norm": 20.52638816833496, + "kl": 0.7109375, + "learning_rate": 9.949333333333332e-07, + "loss": 0.0285, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.8125, + "epoch": 0.010266666666666667, + "grad_norm": 16.176639556884766, + "kl": 0.986328125, + "learning_rate": 9.948666666666666e-07, + "loss": 0.0395, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.625, + "epoch": 0.0104, + "grad_norm": 26.628711700439453, + "kl": 1.15625, + "learning_rate": 9.948e-07, + "loss": 0.0461, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.5, + "epoch": 0.010533333333333334, + "grad_norm": 36.08820343017578, + "kl": 0.6171875, + "learning_rate": 9.947333333333334e-07, + "loss": 0.0247, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.375, + "epoch": 0.010666666666666666, + "grad_norm": 16.42845916748047, + "kl": 0.8984375, + "learning_rate": 9.946666666666666e-07, + "loss": 0.0359, + "reward": 1.4375, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.0625, + "epoch": 0.0108, + "grad_norm": 23.545499801635742, + "kl": 0.962890625, + "learning_rate": 9.946e-07, + "loss": 0.0384, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.75, + "epoch": 0.010933333333333333, + "grad_norm": 18.617446899414062, + "kl": 0.908203125, + "learning_rate": 9.945333333333334e-07, + "loss": 0.0363, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.6875, + "epoch": 0.011066666666666667, + "grad_norm": 37.40855407714844, + "kl": 0.775390625, + "learning_rate": 9.944666666666668e-07, + "loss": 0.031, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.75, + "epoch": 0.0112, + "grad_norm": 31.392597198486328, + "kl": 0.716796875, + "learning_rate": 9.944e-07, + "loss": 0.0287, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.375, + "epoch": 0.011333333333333334, + "grad_norm": 19.989294052124023, + "kl": 0.595703125, + "learning_rate": 9.943333333333331e-07, + "loss": 0.0238, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.8125, + "epoch": 0.011466666666666667, + "grad_norm": 12.358150482177734, + "kl": 0.685546875, + "learning_rate": 9.942666666666665e-07, + "loss": 0.0273, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.25, + "epoch": 0.0116, + "grad_norm": 47.13404083251953, + "kl": 0.6357421875, + "learning_rate": 9.942e-07, + "loss": 0.0255, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.9375, + "epoch": 0.011733333333333333, + "grad_norm": 53.125789642333984, + "kl": 0.7890625, + "learning_rate": 9.941333333333333e-07, + "loss": 0.0316, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.5, + "epoch": 0.011866666666666666, + "grad_norm": 17.871580123901367, + "kl": 0.921875, + "learning_rate": 9.940666666666665e-07, + "loss": 0.0368, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.75, + "epoch": 0.012, + "grad_norm": 20.434871673583984, + "kl": 0.853515625, + "learning_rate": 9.94e-07, + "loss": 0.0341, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.4375, + "epoch": 0.012133333333333333, + "grad_norm": 15.963037490844727, + "kl": 0.91015625, + "learning_rate": 9.939333333333333e-07, + "loss": 0.0364, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.375, + "epoch": 0.012266666666666667, + "grad_norm": 16.94063949584961, + "kl": 0.8056640625, + "learning_rate": 9.938666666666667e-07, + "loss": 0.0323, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.9375, + "epoch": 0.0124, + "grad_norm": 24.466646194458008, + "kl": 0.794921875, + "learning_rate": 9.938e-07, + "loss": 0.0318, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.3125, + "epoch": 0.012533333333333334, + "grad_norm": 12.89165210723877, + "kl": 0.3642578125, + "learning_rate": 9.937333333333333e-07, + "loss": 0.0146, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.0625, + "epoch": 0.012666666666666666, + "grad_norm": 13.110754013061523, + "kl": 0.376953125, + "learning_rate": 9.936666666666667e-07, + "loss": 0.015, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.25, + "epoch": 0.0128, + "grad_norm": 27.841909408569336, + "kl": 0.3349609375, + "learning_rate": 9.936e-07, + "loss": 0.0134, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.012933333333333333, + "grad_norm": 19.15577507019043, + "kl": 0.3271484375, + "learning_rate": 9.935333333333333e-07, + "loss": 0.0131, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.0625, + "epoch": 0.013066666666666667, + "grad_norm": 15.135262489318848, + "kl": 0.3759765625, + "learning_rate": 9.934666666666667e-07, + "loss": 0.015, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.6875, + "epoch": 0.0132, + "grad_norm": 12.982109069824219, + "kl": 0.443359375, + "learning_rate": 9.933999999999999e-07, + "loss": 0.0177, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.0, + "epoch": 0.013333333333333334, + "grad_norm": 33.83216857910156, + "kl": 0.541015625, + "learning_rate": 9.933333333333333e-07, + "loss": 0.0216, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.3125, + "epoch": 0.013466666666666667, + "grad_norm": 11.620631217956543, + "kl": 0.474609375, + "learning_rate": 9.932666666666667e-07, + "loss": 0.0189, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.4375, + "epoch": 0.0136, + "grad_norm": 14.269065856933594, + "kl": 0.35546875, + "learning_rate": 9.931999999999999e-07, + "loss": 0.0142, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.875, + "epoch": 0.013733333333333334, + "grad_norm": 14.084507942199707, + "kl": 0.55419921875, + "learning_rate": 9.931333333333333e-07, + "loss": 0.0222, + "reward": 1.1875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.8125, + "epoch": 0.013866666666666666, + "grad_norm": 52.209171295166016, + "kl": 0.3974609375, + "learning_rate": 9.930666666666667e-07, + "loss": 0.0159, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.875, + "epoch": 0.014, + "grad_norm": 13.03640365600586, + "kl": 0.3134765625, + "learning_rate": 9.929999999999999e-07, + "loss": 0.0125, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.625, + "epoch": 0.014133333333333333, + "grad_norm": 14.031671524047852, + "kl": 0.359375, + "learning_rate": 9.929333333333333e-07, + "loss": 0.0144, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.75, + "epoch": 0.014266666666666667, + "grad_norm": 15.46386432647705, + "kl": 0.35546875, + "learning_rate": 9.928666666666667e-07, + "loss": 0.0142, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.4375, + "epoch": 0.0144, + "grad_norm": 12.01028060913086, + "kl": 0.35107421875, + "learning_rate": 9.928e-07, + "loss": 0.014, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.4375, + "epoch": 0.014533333333333334, + "grad_norm": 20.153623580932617, + "kl": 0.5234375, + "learning_rate": 9.927333333333332e-07, + "loss": 0.0209, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.625, + "epoch": 0.014666666666666666, + "grad_norm": 22.60860824584961, + "kl": 0.875, + "learning_rate": 9.926666666666666e-07, + "loss": 0.035, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.5625, + "epoch": 0.0148, + "grad_norm": 13.447626113891602, + "kl": 0.53125, + "learning_rate": 9.926e-07, + "loss": 0.0212, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.625, + "epoch": 0.014933333333333333, + "grad_norm": 31.64364242553711, + "kl": 0.603515625, + "learning_rate": 9.925333333333334e-07, + "loss": 0.0242, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.6875, + "epoch": 0.015066666666666667, + "grad_norm": 17.9251651763916, + "kl": 0.962890625, + "learning_rate": 9.924666666666666e-07, + "loss": 0.0385, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.875, + "epoch": 0.0152, + "grad_norm": 18.66162872314453, + "kl": 0.935546875, + "learning_rate": 9.923999999999998e-07, + "loss": 0.0374, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.8125, + "epoch": 0.015333333333333332, + "grad_norm": 2.4247355461120605, + "kl": 0.5986328125, + "learning_rate": 9.923333333333332e-07, + "loss": 0.024, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.4375, + "epoch": 0.015466666666666667, + "grad_norm": 22.847930908203125, + "kl": 0.6982421875, + "learning_rate": 9.922666666666666e-07, + "loss": 0.0279, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.875, + "epoch": 0.0156, + "grad_norm": 6.191936016082764, + "kl": 0.6376953125, + "learning_rate": 9.922e-07, + "loss": 0.0255, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0625, + "epoch": 0.015733333333333332, + "grad_norm": 12.495172500610352, + "kl": 0.36474609375, + "learning_rate": 9.921333333333332e-07, + "loss": 0.0146, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.4375, + "epoch": 0.015866666666666668, + "grad_norm": 14.289868354797363, + "kl": 0.69921875, + "learning_rate": 9.920666666666666e-07, + "loss": 0.0279, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.25, + "epoch": 0.016, + "grad_norm": 17.803186416625977, + "kl": 1.02734375, + "learning_rate": 9.92e-07, + "loss": 0.041, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.875, + "epoch": 0.016133333333333333, + "grad_norm": 22.51691436767578, + "kl": 0.697265625, + "learning_rate": 9.919333333333334e-07, + "loss": 0.0279, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.75, + "epoch": 0.016266666666666665, + "grad_norm": 10.507615089416504, + "kl": 0.6328125, + "learning_rate": 9.918666666666666e-07, + "loss": 0.0254, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.375, + "epoch": 0.0164, + "grad_norm": 12.280542373657227, + "kl": 0.44921875, + "learning_rate": 9.918e-07, + "loss": 0.018, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.9375, + "epoch": 0.016533333333333334, + "grad_norm": 238.63795471191406, + "kl": 0.974609375, + "learning_rate": 9.917333333333334e-07, + "loss": 0.0388, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 0.016666666666666666, + "grad_norm": 18.929832458496094, + "kl": 0.69921875, + "learning_rate": 9.916666666666666e-07, + "loss": 0.028, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.75, + "epoch": 0.0168, + "grad_norm": 17.464506149291992, + "kl": 0.83984375, + "learning_rate": 9.916e-07, + "loss": 0.0336, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.0625, + "epoch": 0.016933333333333335, + "grad_norm": 14.615348815917969, + "kl": 0.62109375, + "learning_rate": 9.915333333333334e-07, + "loss": 0.0249, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.625, + "epoch": 0.017066666666666667, + "grad_norm": 22.6370906829834, + "kl": 0.853515625, + "learning_rate": 9.914666666666668e-07, + "loss": 0.0341, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.625, + "epoch": 0.0172, + "grad_norm": 13.0133638381958, + "kl": 0.82421875, + "learning_rate": 9.914e-07, + "loss": 0.033, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.3125, + "epoch": 0.017333333333333333, + "grad_norm": 35.432220458984375, + "kl": 0.5322265625, + "learning_rate": 9.913333333333333e-07, + "loss": 0.0213, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.8125, + "epoch": 0.017466666666666665, + "grad_norm": 17.622594833374023, + "kl": 0.849609375, + "learning_rate": 9.912666666666665e-07, + "loss": 0.0339, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.375, + "epoch": 0.0176, + "grad_norm": 148.60223388671875, + "kl": 0.603515625, + "learning_rate": 9.912e-07, + "loss": 0.0241, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.375, + "epoch": 0.017733333333333334, + "grad_norm": 23.107255935668945, + "kl": 0.72265625, + "learning_rate": 9.911333333333333e-07, + "loss": 0.0289, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.8125, + "epoch": 0.017866666666666666, + "grad_norm": 7.2212724685668945, + "kl": 0.6455078125, + "learning_rate": 9.910666666666665e-07, + "loss": 0.0259, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.9375, + "epoch": 0.018, + "grad_norm": 15.971834182739258, + "kl": 0.8046875, + "learning_rate": 9.91e-07, + "loss": 0.0322, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.3125, + "epoch": 0.018133333333333335, + "grad_norm": 13.13696575164795, + "kl": 0.68359375, + "learning_rate": 9.909333333333333e-07, + "loss": 0.0273, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.375, + "epoch": 0.018266666666666667, + "grad_norm": 17.271879196166992, + "kl": 0.845703125, + "learning_rate": 9.908666666666667e-07, + "loss": 0.0337, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.125, + "epoch": 0.0184, + "grad_norm": 17.996219635009766, + "kl": 0.5966796875, + "learning_rate": 9.908e-07, + "loss": 0.0239, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 0.018533333333333332, + "grad_norm": 12.226297378540039, + "kl": 0.4921875, + "learning_rate": 9.907333333333333e-07, + "loss": 0.0197, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.3125, + "epoch": 0.018666666666666668, + "grad_norm": 27.36733055114746, + "kl": 0.69921875, + "learning_rate": 9.906666666666667e-07, + "loss": 0.0279, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.875, + "epoch": 0.0188, + "grad_norm": 16.278030395507812, + "kl": 0.78125, + "learning_rate": 9.906e-07, + "loss": 0.0312, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.4375, + "epoch": 0.018933333333333333, + "grad_norm": 17.587865829467773, + "kl": 0.275390625, + "learning_rate": 9.905333333333333e-07, + "loss": 0.011, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.019066666666666666, + "grad_norm": 18.78192901611328, + "kl": 0.943359375, + "learning_rate": 9.904666666666667e-07, + "loss": 0.0377, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.75, + "epoch": 0.0192, + "grad_norm": 18.528474807739258, + "kl": 0.5703125, + "learning_rate": 9.903999999999999e-07, + "loss": 0.0228, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.019333333333333334, + "grad_norm": 22.421634674072266, + "kl": 0.6376953125, + "learning_rate": 9.903333333333333e-07, + "loss": 0.0255, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.3125, + "epoch": 0.019466666666666667, + "grad_norm": 12.865647315979004, + "kl": 0.525390625, + "learning_rate": 9.902666666666667e-07, + "loss": 0.0211, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.0625, + "epoch": 0.0196, + "grad_norm": 13.61320686340332, + "kl": 0.681640625, + "learning_rate": 9.901999999999999e-07, + "loss": 0.0273, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.8125, + "epoch": 0.019733333333333332, + "grad_norm": 15.05859088897705, + "kl": 1.0234375, + "learning_rate": 9.901333333333333e-07, + "loss": 0.041, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.0625, + "epoch": 0.019866666666666668, + "grad_norm": 19.793874740600586, + "kl": 0.90625, + "learning_rate": 9.900666666666667e-07, + "loss": 0.0362, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.9375, + "epoch": 0.02, + "grad_norm": 12.184428215026855, + "kl": 0.876953125, + "learning_rate": 9.9e-07, + "loss": 0.0351, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.125, + "epoch": 0.020133333333333333, + "grad_norm": 18.433015823364258, + "kl": 0.80859375, + "learning_rate": 9.899333333333332e-07, + "loss": 0.0324, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.3125, + "epoch": 0.020266666666666665, + "grad_norm": 12.253523826599121, + "kl": 0.7421875, + "learning_rate": 9.898666666666666e-07, + "loss": 0.0298, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.875, + "epoch": 0.0204, + "grad_norm": 20.6929931640625, + "kl": 0.5595703125, + "learning_rate": 9.898e-07, + "loss": 0.0224, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.9375, + "epoch": 0.020533333333333334, + "grad_norm": 13.055486679077148, + "kl": 0.7490234375, + "learning_rate": 9.897333333333332e-07, + "loss": 0.03, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.75, + "epoch": 0.020666666666666667, + "grad_norm": 15.21764087677002, + "kl": 0.5244140625, + "learning_rate": 9.896666666666666e-07, + "loss": 0.0209, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.125, + "epoch": 0.0208, + "grad_norm": 24.90786361694336, + "kl": 0.697265625, + "learning_rate": 9.896e-07, + "loss": 0.028, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.4375, + "epoch": 0.020933333333333335, + "grad_norm": 41.89891052246094, + "kl": 0.767578125, + "learning_rate": 9.895333333333334e-07, + "loss": 0.0307, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.5625, + "epoch": 0.021066666666666668, + "grad_norm": 13.578216552734375, + "kl": 0.8125, + "learning_rate": 9.894666666666666e-07, + "loss": 0.0325, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.1875, + "epoch": 0.0212, + "grad_norm": 38.06028747558594, + "kl": 1.26953125, + "learning_rate": 9.894e-07, + "loss": 0.0507, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.1875, + "epoch": 0.021333333333333333, + "grad_norm": 54.9559440612793, + "kl": 0.55078125, + "learning_rate": 9.893333333333332e-07, + "loss": 0.022, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.1875, + "epoch": 0.021466666666666665, + "grad_norm": 14.98383903503418, + "kl": 0.931640625, + "learning_rate": 9.892666666666666e-07, + "loss": 0.0373, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.25, + "epoch": 0.0216, + "grad_norm": 22.747777938842773, + "kl": 1.12890625, + "learning_rate": 9.892e-07, + "loss": 0.0452, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.3125, + "epoch": 0.021733333333333334, + "grad_norm": 19.63970947265625, + "kl": 0.69140625, + "learning_rate": 9.891333333333332e-07, + "loss": 0.0276, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.021866666666666666, + "grad_norm": 30.969717025756836, + "kl": 0.80859375, + "learning_rate": 9.890666666666666e-07, + "loss": 0.0324, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.4375, + "epoch": 0.022, + "grad_norm": 32.15286636352539, + "kl": 1.0703125, + "learning_rate": 9.89e-07, + "loss": 0.0428, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.625, + "epoch": 0.022133333333333335, + "grad_norm": 27.347640991210938, + "kl": 0.705078125, + "learning_rate": 9.889333333333334e-07, + "loss": 0.0282, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.8125, + "epoch": 0.022266666666666667, + "grad_norm": 20.4211483001709, + "kl": 0.966796875, + "learning_rate": 9.888666666666666e-07, + "loss": 0.0387, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.3125, + "epoch": 0.0224, + "grad_norm": 24.80679702758789, + "kl": 0.6328125, + "learning_rate": 9.888e-07, + "loss": 0.0254, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.0, + "epoch": 0.022533333333333332, + "grad_norm": 15.55086898803711, + "kl": 0.654296875, + "learning_rate": 9.887333333333334e-07, + "loss": 0.0262, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.6875, + "epoch": 0.02266666666666667, + "grad_norm": 16.75022315979004, + "kl": 0.5234375, + "learning_rate": 9.886666666666665e-07, + "loss": 0.021, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.0, + "epoch": 0.0228, + "grad_norm": 19.413557052612305, + "kl": 0.8388671875, + "learning_rate": 9.886e-07, + "loss": 0.0335, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.625, + "epoch": 0.022933333333333333, + "grad_norm": 27.413616180419922, + "kl": 1.21484375, + "learning_rate": 9.885333333333333e-07, + "loss": 0.0486, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.6875, + "epoch": 0.023066666666666666, + "grad_norm": 20.21346664428711, + "kl": 0.658203125, + "learning_rate": 9.884666666666667e-07, + "loss": 0.0264, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.125, + "epoch": 0.0232, + "grad_norm": 17.35886573791504, + "kl": 0.853515625, + "learning_rate": 9.884e-07, + "loss": 0.0341, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.9375, + "epoch": 0.023333333333333334, + "grad_norm": 36.11369323730469, + "kl": 1.48046875, + "learning_rate": 9.883333333333333e-07, + "loss": 0.0593, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0625, + "epoch": 0.023466666666666667, + "grad_norm": 108.15668487548828, + "kl": 0.8955078125, + "learning_rate": 9.882666666666665e-07, + "loss": 0.0359, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.125, + "epoch": 0.0236, + "grad_norm": 16.324493408203125, + "kl": 1.0703125, + "learning_rate": 9.882e-07, + "loss": 0.0428, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 8.5625, + "epoch": 0.023733333333333332, + "grad_norm": 22.483062744140625, + "kl": 1.0, + "learning_rate": 9.881333333333333e-07, + "loss": 0.04, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 11.0625, + "epoch": 0.023866666666666668, + "grad_norm": 31.09198570251465, + "kl": 1.3359375, + "learning_rate": 9.880666666666665e-07, + "loss": 0.0534, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 9.625, + "epoch": 0.024, + "grad_norm": 18.65386199951172, + "kl": 1.017578125, + "learning_rate": 9.88e-07, + "loss": 0.0407, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.4375, + "epoch": 0.024133333333333333, + "grad_norm": 13.64356803894043, + "kl": 0.697265625, + "learning_rate": 9.879333333333333e-07, + "loss": 0.0279, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.625, + "epoch": 0.024266666666666666, + "grad_norm": 12.369552612304688, + "kl": 0.919921875, + "learning_rate": 9.878666666666667e-07, + "loss": 0.0369, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.875, + "epoch": 0.0244, + "grad_norm": 6.512619972229004, + "kl": 0.650390625, + "learning_rate": 9.877999999999999e-07, + "loss": 0.026, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.0, + "epoch": 0.024533333333333334, + "grad_norm": 14.247668266296387, + "kl": 1.0625, + "learning_rate": 9.877333333333333e-07, + "loss": 0.0425, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.1875, + "epoch": 0.024666666666666667, + "grad_norm": 18.030153274536133, + "kl": 0.6181640625, + "learning_rate": 9.876666666666667e-07, + "loss": 0.0247, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.5625, + "epoch": 0.0248, + "grad_norm": 10.114640235900879, + "kl": 0.75390625, + "learning_rate": 9.876e-07, + "loss": 0.0302, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.5625, + "epoch": 0.02493333333333333, + "grad_norm": 25.959131240844727, + "kl": 0.880859375, + "learning_rate": 9.875333333333333e-07, + "loss": 0.0352, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.5, + "epoch": 0.025066666666666668, + "grad_norm": 1.4535435438156128, + "kl": 0.876953125, + "learning_rate": 9.874666666666667e-07, + "loss": 0.0351, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 11.3125, + "epoch": 0.0252, + "grad_norm": 23.176555633544922, + "kl": 1.40234375, + "learning_rate": 9.874e-07, + "loss": 0.056, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.5625, + "epoch": 0.025333333333333333, + "grad_norm": 18.551677703857422, + "kl": 0.7060546875, + "learning_rate": 9.873333333333333e-07, + "loss": 0.0282, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.0625, + "epoch": 0.025466666666666665, + "grad_norm": 13.087846755981445, + "kl": 0.662109375, + "learning_rate": 9.872666666666667e-07, + "loss": 0.0266, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.3125, + "epoch": 0.0256, + "grad_norm": 345.0792236328125, + "kl": 0.576171875, + "learning_rate": 9.871999999999998e-07, + "loss": 0.023, + "reward": 1.4375, + "reward_std": 0.7253239452838898, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.875, + "epoch": 0.025733333333333334, + "grad_norm": 20.299753189086914, + "kl": 0.859375, + "learning_rate": 9.871333333333332e-07, + "loss": 0.0343, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 11.625, + "epoch": 0.025866666666666666, + "grad_norm": 14.396468162536621, + "kl": 0.595703125, + "learning_rate": 9.870666666666666e-07, + "loss": 0.0239, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.75, + "epoch": 0.026, + "grad_norm": 35.1627082824707, + "kl": 1.04296875, + "learning_rate": 9.87e-07, + "loss": 0.0417, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.0625, + "epoch": 0.026133333333333335, + "grad_norm": 6.835484504699707, + "kl": 0.6162109375, + "learning_rate": 9.869333333333332e-07, + "loss": 0.0246, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.8125, + "epoch": 0.026266666666666667, + "grad_norm": 3.192700147628784, + "kl": 0.76171875, + "learning_rate": 9.868666666666666e-07, + "loss": 0.0304, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.125, + "epoch": 0.0264, + "grad_norm": 14.991588592529297, + "kl": 1.013671875, + "learning_rate": 9.868e-07, + "loss": 0.0406, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.875, + "epoch": 0.026533333333333332, + "grad_norm": 20.243270874023438, + "kl": 1.009765625, + "learning_rate": 9.867333333333332e-07, + "loss": 0.0404, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.1875, + "epoch": 0.02666666666666667, + "grad_norm": 22.62047576904297, + "kl": 0.4921875, + "learning_rate": 9.866666666666666e-07, + "loss": 0.0196, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.875, + "epoch": 0.0268, + "grad_norm": 129.14126586914062, + "kl": 0.474609375, + "learning_rate": 9.866e-07, + "loss": 0.019, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.3125, + "epoch": 0.026933333333333333, + "grad_norm": 7.954946041107178, + "kl": 0.69140625, + "learning_rate": 9.865333333333334e-07, + "loss": 0.0276, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.0, + "epoch": 0.027066666666666666, + "grad_norm": 19.615711212158203, + "kl": 0.7998046875, + "learning_rate": 9.864666666666666e-07, + "loss": 0.0319, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.875, + "epoch": 0.0272, + "grad_norm": 17.742704391479492, + "kl": 0.9765625, + "learning_rate": 9.864e-07, + "loss": 0.0391, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.027333333333333334, + "grad_norm": 24.377161026000977, + "kl": 1.025390625, + "learning_rate": 9.863333333333332e-07, + "loss": 0.0411, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.0, + "epoch": 0.027466666666666667, + "grad_norm": 27.315765380859375, + "kl": 0.6640625, + "learning_rate": 9.862666666666666e-07, + "loss": 0.0265, + "reward": 1.25, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.4375, + "epoch": 0.0276, + "grad_norm": 12.837471961975098, + "kl": 0.5146484375, + "learning_rate": 9.862e-07, + "loss": 0.0206, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.3125, + "epoch": 0.027733333333333332, + "grad_norm": 9.939675331115723, + "kl": 0.6123046875, + "learning_rate": 9.861333333333332e-07, + "loss": 0.0245, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.25, + "epoch": 0.027866666666666668, + "grad_norm": 18.40255355834961, + "kl": 0.990234375, + "learning_rate": 9.860666666666666e-07, + "loss": 0.0396, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.5625, + "epoch": 0.028, + "grad_norm": 62.61850357055664, + "kl": 0.4892578125, + "learning_rate": 9.86e-07, + "loss": 0.0196, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.375, + "epoch": 0.028133333333333333, + "grad_norm": 16.80623435974121, + "kl": 0.46484375, + "learning_rate": 9.859333333333334e-07, + "loss": 0.0187, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.3125, + "epoch": 0.028266666666666666, + "grad_norm": 24.76730728149414, + "kl": 0.546875, + "learning_rate": 9.858666666666665e-07, + "loss": 0.0219, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.8125, + "epoch": 0.0284, + "grad_norm": 21.657642364501953, + "kl": 0.73046875, + "learning_rate": 9.858e-07, + "loss": 0.0292, + "reward": 1.4375, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.4375, + "epoch": 0.028533333333333334, + "grad_norm": 16.732641220092773, + "kl": 0.447265625, + "learning_rate": 9.857333333333333e-07, + "loss": 0.0179, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.5, + "epoch": 0.028666666666666667, + "grad_norm": 17.12205696105957, + "kl": 0.482421875, + "learning_rate": 9.856666666666667e-07, + "loss": 0.0193, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.75, + "epoch": 0.0288, + "grad_norm": 28.225032806396484, + "kl": 0.625, + "learning_rate": 9.856e-07, + "loss": 0.025, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.75, + "epoch": 0.028933333333333332, + "grad_norm": 27.516183853149414, + "kl": 0.662109375, + "learning_rate": 9.855333333333333e-07, + "loss": 0.0265, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.9375, + "epoch": 0.029066666666666668, + "grad_norm": 34.51641845703125, + "kl": 0.5791015625, + "learning_rate": 9.854666666666667e-07, + "loss": 0.0232, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.3125, + "epoch": 0.0292, + "grad_norm": 7.49312686920166, + "kl": 0.6591796875, + "learning_rate": 9.854e-07, + "loss": 0.0264, + "reward": 1.0625, + "reward_std": 0.4172614812850952, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.625, + "epoch": 0.029333333333333333, + "grad_norm": 26.38434410095215, + "kl": 0.873046875, + "learning_rate": 9.853333333333333e-07, + "loss": 0.0349, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.9375, + "epoch": 0.029466666666666665, + "grad_norm": 19.001075744628906, + "kl": 0.791015625, + "learning_rate": 9.852666666666665e-07, + "loss": 0.0317, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.0, + "epoch": 0.0296, + "grad_norm": 47.197654724121094, + "kl": 1.01953125, + "learning_rate": 9.852e-07, + "loss": 0.0407, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.5, + "epoch": 0.029733333333333334, + "grad_norm": 20.01003074645996, + "kl": 1.12109375, + "learning_rate": 9.851333333333333e-07, + "loss": 0.045, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.0625, + "epoch": 0.029866666666666666, + "grad_norm": 28.494436264038086, + "kl": 1.0078125, + "learning_rate": 9.850666666666667e-07, + "loss": 0.0403, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.125, + "epoch": 0.03, + "grad_norm": 12.490227699279785, + "kl": 0.806640625, + "learning_rate": 9.849999999999999e-07, + "loss": 0.0322, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.75, + "epoch": 0.030133333333333335, + "grad_norm": 32.93749237060547, + "kl": 1.203125, + "learning_rate": 9.849333333333333e-07, + "loss": 0.0482, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.8125, + "epoch": 0.030266666666666667, + "grad_norm": 11.514131546020508, + "kl": 0.728515625, + "learning_rate": 9.848666666666667e-07, + "loss": 0.0292, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.75, + "epoch": 0.0304, + "grad_norm": 16.009794235229492, + "kl": 1.44140625, + "learning_rate": 9.847999999999999e-07, + "loss": 0.0577, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.4375, + "epoch": 0.030533333333333332, + "grad_norm": 8.068607330322266, + "kl": 0.607421875, + "learning_rate": 9.847333333333333e-07, + "loss": 0.0242, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.3125, + "epoch": 0.030666666666666665, + "grad_norm": 21.535703659057617, + "kl": 0.728515625, + "learning_rate": 9.846666666666667e-07, + "loss": 0.0291, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.0, + "epoch": 0.0308, + "grad_norm": 21.015825271606445, + "kl": 0.607421875, + "learning_rate": 9.846e-07, + "loss": 0.0242, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.8125, + "epoch": 0.030933333333333334, + "grad_norm": 26.39970588684082, + "kl": 0.876953125, + "learning_rate": 9.845333333333333e-07, + "loss": 0.035, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.125, + "epoch": 0.031066666666666666, + "grad_norm": 21.466392517089844, + "kl": 0.80859375, + "learning_rate": 9.844666666666667e-07, + "loss": 0.0324, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.75, + "epoch": 0.0312, + "grad_norm": 19.049327850341797, + "kl": 0.78125, + "learning_rate": 9.844e-07, + "loss": 0.0313, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.0625, + "epoch": 0.03133333333333333, + "grad_norm": 16.00638771057129, + "kl": 0.8671875, + "learning_rate": 9.843333333333332e-07, + "loss": 0.0347, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.75, + "epoch": 0.031466666666666664, + "grad_norm": 17.287046432495117, + "kl": 0.814453125, + "learning_rate": 9.842666666666666e-07, + "loss": 0.0326, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.0, + "epoch": 0.0316, + "grad_norm": 13.796801567077637, + "kl": 1.3359375, + "learning_rate": 9.841999999999998e-07, + "loss": 0.0536, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.5625, + "epoch": 0.031733333333333336, + "grad_norm": 18.133344650268555, + "kl": 0.857421875, + "learning_rate": 9.841333333333332e-07, + "loss": 0.0342, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.8125, + "epoch": 0.03186666666666667, + "grad_norm": 38.46413803100586, + "kl": 0.96484375, + "learning_rate": 9.840666666666666e-07, + "loss": 0.0385, + "reward": 1.375, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.8125, + "epoch": 0.032, + "grad_norm": 17.039064407348633, + "kl": 0.7421875, + "learning_rate": 9.84e-07, + "loss": 0.0297, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.375, + "epoch": 0.03213333333333333, + "grad_norm": 10.95724105834961, + "kl": 0.859375, + "learning_rate": 9.839333333333332e-07, + "loss": 0.0344, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.625, + "epoch": 0.032266666666666666, + "grad_norm": 14.58068561553955, + "kl": 0.58984375, + "learning_rate": 9.838666666666666e-07, + "loss": 0.0236, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.0, + "epoch": 0.0324, + "grad_norm": 21.77033042907715, + "kl": 0.806640625, + "learning_rate": 9.838e-07, + "loss": 0.0322, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.3125, + "epoch": 0.03253333333333333, + "grad_norm": 30.797826766967773, + "kl": 1.138671875, + "learning_rate": 9.837333333333334e-07, + "loss": 0.0457, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.8125, + "epoch": 0.03266666666666666, + "grad_norm": 16.987323760986328, + "kl": 0.5703125, + "learning_rate": 9.836666666666666e-07, + "loss": 0.0228, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75, + "epoch": 0.0328, + "grad_norm": 13.19991683959961, + "kl": 0.552734375, + "learning_rate": 9.836e-07, + "loss": 0.022, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.125, + "epoch": 0.032933333333333335, + "grad_norm": 22.497196197509766, + "kl": 0.685546875, + "learning_rate": 9.835333333333334e-07, + "loss": 0.0274, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.6875, + "epoch": 0.03306666666666667, + "grad_norm": 19.850326538085938, + "kl": 0.810546875, + "learning_rate": 9.834666666666666e-07, + "loss": 0.0324, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.1875, + "epoch": 0.0332, + "grad_norm": 26.813716888427734, + "kl": 1.095703125, + "learning_rate": 9.834e-07, + "loss": 0.0436, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.75, + "epoch": 0.03333333333333333, + "grad_norm": 22.231048583984375, + "kl": 0.58984375, + "learning_rate": 9.833333333333332e-07, + "loss": 0.0236, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.0, + "epoch": 0.033466666666666665, + "grad_norm": 22.144176483154297, + "kl": 0.953125, + "learning_rate": 9.832666666666666e-07, + "loss": 0.0381, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.3125, + "epoch": 0.0336, + "grad_norm": 12.343694686889648, + "kl": 0.646484375, + "learning_rate": 9.832e-07, + "loss": 0.0259, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.5625, + "epoch": 0.03373333333333333, + "grad_norm": 16.42769432067871, + "kl": 0.650390625, + "learning_rate": 9.831333333333334e-07, + "loss": 0.026, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.0625, + "epoch": 0.03386666666666667, + "grad_norm": 22.4140682220459, + "kl": 0.837890625, + "learning_rate": 9.830666666666665e-07, + "loss": 0.0335, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.125, + "epoch": 0.034, + "grad_norm": 9.399197578430176, + "kl": 0.4765625, + "learning_rate": 9.83e-07, + "loss": 0.019, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.625, + "epoch": 0.034133333333333335, + "grad_norm": 25.336023330688477, + "kl": 0.59765625, + "learning_rate": 9.829333333333333e-07, + "loss": 0.0239, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.625, + "epoch": 0.03426666666666667, + "grad_norm": 19.239116668701172, + "kl": 0.61083984375, + "learning_rate": 9.828666666666665e-07, + "loss": 0.0245, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.1875, + "epoch": 0.0344, + "grad_norm": 21.070823669433594, + "kl": 0.8671875, + "learning_rate": 9.828e-07, + "loss": 0.0347, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.5625, + "epoch": 0.03453333333333333, + "grad_norm": 29.17822265625, + "kl": 0.6201171875, + "learning_rate": 9.827333333333333e-07, + "loss": 0.0248, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.875, + "epoch": 0.034666666666666665, + "grad_norm": 6.3060407638549805, + "kl": 0.478515625, + "learning_rate": 9.826666666666667e-07, + "loss": 0.0191, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.5625, + "epoch": 0.0348, + "grad_norm": 14.557801246643066, + "kl": 0.5458984375, + "learning_rate": 9.826e-07, + "loss": 0.0218, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.6875, + "epoch": 0.03493333333333333, + "grad_norm": 15.59456729888916, + "kl": 0.54296875, + "learning_rate": 9.825333333333333e-07, + "loss": 0.0217, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.0625, + "epoch": 0.03506666666666667, + "grad_norm": 5.2625956535339355, + "kl": 0.29931640625, + "learning_rate": 9.824666666666667e-07, + "loss": 0.0119, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.8125, + "epoch": 0.0352, + "grad_norm": 36.987342834472656, + "kl": 0.5556640625, + "learning_rate": 9.824e-07, + "loss": 0.0222, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.1875, + "epoch": 0.035333333333333335, + "grad_norm": 12.845407485961914, + "kl": 0.556640625, + "learning_rate": 9.823333333333333e-07, + "loss": 0.0223, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.9375, + "epoch": 0.03546666666666667, + "grad_norm": 15.359212875366211, + "kl": 0.4912109375, + "learning_rate": 9.822666666666665e-07, + "loss": 0.0196, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.5, + "epoch": 0.0356, + "grad_norm": 21.93207359313965, + "kl": 0.5341796875, + "learning_rate": 9.821999999999999e-07, + "loss": 0.0214, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5, + "epoch": 0.03573333333333333, + "grad_norm": 19.36957550048828, + "kl": 0.5126953125, + "learning_rate": 9.821333333333333e-07, + "loss": 0.0204, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.3125, + "epoch": 0.035866666666666665, + "grad_norm": 17.08452606201172, + "kl": 0.4296875, + "learning_rate": 9.820666666666667e-07, + "loss": 0.0172, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.5, + "epoch": 0.036, + "grad_norm": 22.985898971557617, + "kl": 0.6142578125, + "learning_rate": 9.819999999999999e-07, + "loss": 0.0246, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.625, + "epoch": 0.03613333333333334, + "grad_norm": 12.593403816223145, + "kl": 0.486328125, + "learning_rate": 9.819333333333333e-07, + "loss": 0.0195, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.25, + "epoch": 0.03626666666666667, + "grad_norm": 18.34018325805664, + "kl": 0.4970703125, + "learning_rate": 9.818666666666667e-07, + "loss": 0.0199, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.375, + "epoch": 0.0364, + "grad_norm": 17.0610408782959, + "kl": 0.3896484375, + "learning_rate": 9.817999999999999e-07, + "loss": 0.0156, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0, + "epoch": 0.036533333333333334, + "grad_norm": 16.06097412109375, + "kl": 0.404296875, + "learning_rate": 9.817333333333333e-07, + "loss": 0.0162, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.8125, + "epoch": 0.03666666666666667, + "grad_norm": 15.046359062194824, + "kl": 0.55859375, + "learning_rate": 9.816666666666667e-07, + "loss": 0.0224, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.0625, + "epoch": 0.0368, + "grad_norm": 1.1510478258132935, + "kl": 0.6708984375, + "learning_rate": 9.816e-07, + "loss": 0.0268, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.25, + "epoch": 0.03693333333333333, + "grad_norm": 14.608692169189453, + "kl": 0.3564453125, + "learning_rate": 9.815333333333332e-07, + "loss": 0.0143, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5625, + "epoch": 0.037066666666666664, + "grad_norm": 14.41381549835205, + "kl": 0.55078125, + "learning_rate": 9.814666666666666e-07, + "loss": 0.022, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.0, + "epoch": 0.0372, + "grad_norm": 83.32125854492188, + "kl": 0.3388671875, + "learning_rate": 9.814e-07, + "loss": 0.0136, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.5625, + "epoch": 0.037333333333333336, + "grad_norm": 18.038288116455078, + "kl": 0.4296875, + "learning_rate": 9.813333333333332e-07, + "loss": 0.0172, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.0, + "epoch": 0.03746666666666667, + "grad_norm": 14.569624900817871, + "kl": 0.24267578125, + "learning_rate": 9.812666666666666e-07, + "loss": 0.0097, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.8125, + "epoch": 0.0376, + "grad_norm": 14.079543113708496, + "kl": 0.4404296875, + "learning_rate": 9.811999999999998e-07, + "loss": 0.0176, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.625, + "epoch": 0.037733333333333334, + "grad_norm": 19.957908630371094, + "kl": 0.6796875, + "learning_rate": 9.811333333333332e-07, + "loss": 0.0272, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.4375, + "epoch": 0.037866666666666667, + "grad_norm": 16.928340911865234, + "kl": 0.494140625, + "learning_rate": 9.810666666666666e-07, + "loss": 0.0198, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.4375, + "epoch": 0.038, + "grad_norm": 41.62216567993164, + "kl": 2.146484375, + "learning_rate": 9.81e-07, + "loss": 0.0858, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.4375, + "epoch": 0.03813333333333333, + "grad_norm": 17.523632049560547, + "kl": 0.4599609375, + "learning_rate": 9.809333333333332e-07, + "loss": 0.0185, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.0625, + "epoch": 0.038266666666666664, + "grad_norm": 12.223152160644531, + "kl": 0.3955078125, + "learning_rate": 9.808666666666666e-07, + "loss": 0.0158, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.9375, + "epoch": 0.0384, + "grad_norm": 9.822270393371582, + "kl": 0.4501953125, + "learning_rate": 9.808e-07, + "loss": 0.018, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.375, + "epoch": 0.038533333333333336, + "grad_norm": 7.69195032119751, + "kl": 0.3193359375, + "learning_rate": 9.807333333333334e-07, + "loss": 0.0128, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.4375, + "epoch": 0.03866666666666667, + "grad_norm": 9.337879180908203, + "kl": 0.20166015625, + "learning_rate": 9.806666666666666e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.125, + "epoch": 0.0388, + "grad_norm": 17.7623291015625, + "kl": 0.318359375, + "learning_rate": 9.806e-07, + "loss": 0.0127, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.75, + "epoch": 0.038933333333333334, + "grad_norm": 12.527984619140625, + "kl": 0.44921875, + "learning_rate": 9.805333333333334e-07, + "loss": 0.0179, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.9375, + "epoch": 0.039066666666666666, + "grad_norm": 11.648524284362793, + "kl": 0.462890625, + "learning_rate": 9.804666666666666e-07, + "loss": 0.0186, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.125, + "epoch": 0.0392, + "grad_norm": 22.93866729736328, + "kl": 0.306640625, + "learning_rate": 9.804e-07, + "loss": 0.0123, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.875, + "epoch": 0.03933333333333333, + "grad_norm": 67.07592010498047, + "kl": 0.44921875, + "learning_rate": 9.803333333333332e-07, + "loss": 0.0179, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.0, + "epoch": 0.039466666666666664, + "grad_norm": 13.058833122253418, + "kl": 0.40625, + "learning_rate": 9.802666666666666e-07, + "loss": 0.0163, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.375, + "epoch": 0.0396, + "grad_norm": 13.308572769165039, + "kl": 0.6650390625, + "learning_rate": 9.802e-07, + "loss": 0.0266, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.0625, + "epoch": 0.039733333333333336, + "grad_norm": 11.021879196166992, + "kl": 0.58984375, + "learning_rate": 9.801333333333333e-07, + "loss": 0.0235, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.03986666666666667, + "grad_norm": 18.73659324645996, + "kl": 0.4345703125, + "learning_rate": 9.800666666666665e-07, + "loss": 0.0174, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.4375, + "epoch": 0.04, + "grad_norm": 12.47770881652832, + "kl": 0.693359375, + "learning_rate": 9.8e-07, + "loss": 0.0277, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.5625, + "epoch": 0.04013333333333333, + "grad_norm": 16.926860809326172, + "kl": 0.65234375, + "learning_rate": 9.799333333333333e-07, + "loss": 0.0261, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.125, + "epoch": 0.040266666666666666, + "grad_norm": 8.325267791748047, + "kl": 0.443359375, + "learning_rate": 9.798666666666665e-07, + "loss": 0.0177, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.375, + "epoch": 0.0404, + "grad_norm": 46.01936340332031, + "kl": 0.484375, + "learning_rate": 9.798e-07, + "loss": 0.0193, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.8125, + "epoch": 0.04053333333333333, + "grad_norm": 11.257341384887695, + "kl": 0.82421875, + "learning_rate": 9.797333333333333e-07, + "loss": 0.033, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.25, + "epoch": 0.04066666666666666, + "grad_norm": 13.392840385437012, + "kl": 0.7880859375, + "learning_rate": 9.796666666666667e-07, + "loss": 0.0316, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.125, + "epoch": 0.0408, + "grad_norm": 23.589248657226562, + "kl": 0.583984375, + "learning_rate": 9.796e-07, + "loss": 0.0234, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.125, + "epoch": 0.040933333333333335, + "grad_norm": 9.185405731201172, + "kl": 0.65234375, + "learning_rate": 9.795333333333333e-07, + "loss": 0.0261, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.375, + "epoch": 0.04106666666666667, + "grad_norm": 11.818926811218262, + "kl": 0.91796875, + "learning_rate": 9.794666666666667e-07, + "loss": 0.0366, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.0625, + "epoch": 0.0412, + "grad_norm": 11.160907745361328, + "kl": 0.884765625, + "learning_rate": 9.794e-07, + "loss": 0.0354, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.75, + "epoch": 0.04133333333333333, + "grad_norm": 12.5053129196167, + "kl": 0.869140625, + "learning_rate": 9.793333333333333e-07, + "loss": 0.0347, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.4375, + "epoch": 0.041466666666666666, + "grad_norm": 17.424509048461914, + "kl": 0.775390625, + "learning_rate": 9.792666666666665e-07, + "loss": 0.031, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.75, + "epoch": 0.0416, + "grad_norm": 11.972318649291992, + "kl": 0.833984375, + "learning_rate": 9.791999999999999e-07, + "loss": 0.0333, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.375, + "epoch": 0.04173333333333333, + "grad_norm": 14.554648399353027, + "kl": 0.97265625, + "learning_rate": 9.791333333333333e-07, + "loss": 0.039, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.375, + "epoch": 0.04186666666666667, + "grad_norm": 1.2648481130599976, + "kl": 1.009765625, + "learning_rate": 9.790666666666667e-07, + "loss": 0.0405, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.0625, + "epoch": 0.042, + "grad_norm": 16.75462532043457, + "kl": 0.95703125, + "learning_rate": 9.789999999999999e-07, + "loss": 0.0383, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.6875, + "epoch": 0.042133333333333335, + "grad_norm": 10.437578201293945, + "kl": 0.80859375, + "learning_rate": 9.789333333333333e-07, + "loss": 0.0324, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.0625, + "epoch": 0.04226666666666667, + "grad_norm": 16.345964431762695, + "kl": 1.06640625, + "learning_rate": 9.788666666666667e-07, + "loss": 0.0425, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.9375, + "epoch": 0.0424, + "grad_norm": 17.60611915588379, + "kl": 1.046875, + "learning_rate": 9.788e-07, + "loss": 0.0419, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.0, + "epoch": 0.04253333333333333, + "grad_norm": 26.33865737915039, + "kl": 0.935546875, + "learning_rate": 9.787333333333332e-07, + "loss": 0.0375, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.8125, + "epoch": 0.042666666666666665, + "grad_norm": 16.802326202392578, + "kl": 0.6220703125, + "learning_rate": 9.786666666666666e-07, + "loss": 0.0249, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0, + "epoch": 0.0428, + "grad_norm": 244.73745727539062, + "kl": 1.8671875, + "learning_rate": 9.786e-07, + "loss": 0.0747, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 0.04293333333333333, + "grad_norm": 25.364192962646484, + "kl": 0.91796875, + "learning_rate": 9.785333333333332e-07, + "loss": 0.0367, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.875, + "epoch": 0.04306666666666667, + "grad_norm": 117.90184020996094, + "kl": 0.94140625, + "learning_rate": 9.784666666666666e-07, + "loss": 0.0378, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.875, + "epoch": 0.0432, + "grad_norm": 17.863727569580078, + "kl": 0.96484375, + "learning_rate": 9.784e-07, + "loss": 0.0386, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.375, + "epoch": 0.043333333333333335, + "grad_norm": 5.408234119415283, + "kl": 0.697265625, + "learning_rate": 9.783333333333334e-07, + "loss": 0.0279, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.6875, + "epoch": 0.04346666666666667, + "grad_norm": 13.518305778503418, + "kl": 0.771484375, + "learning_rate": 9.782666666666666e-07, + "loss": 0.0309, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.3125, + "epoch": 0.0436, + "grad_norm": 11.743226051330566, + "kl": 0.4619140625, + "learning_rate": 9.782e-07, + "loss": 0.0184, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.875, + "epoch": 0.04373333333333333, + "grad_norm": 858.3458251953125, + "kl": 0.4345703125, + "learning_rate": 9.781333333333332e-07, + "loss": 0.0174, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 0.043866666666666665, + "grad_norm": 23.823272705078125, + "kl": 0.82421875, + "learning_rate": 9.780666666666666e-07, + "loss": 0.033, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.9375, + "epoch": 0.044, + "grad_norm": 16.229509353637695, + "kl": 0.533203125, + "learning_rate": 9.78e-07, + "loss": 0.0213, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.25, + "epoch": 0.04413333333333333, + "grad_norm": 11.00096607208252, + "kl": 0.76171875, + "learning_rate": 9.779333333333332e-07, + "loss": 0.0304, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 0.04426666666666667, + "grad_norm": 17.93421173095703, + "kl": 0.6796875, + "learning_rate": 9.778666666666666e-07, + "loss": 0.0272, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.0, + "epoch": 0.0444, + "grad_norm": 18.12625503540039, + "kl": 0.712890625, + "learning_rate": 9.778e-07, + "loss": 0.0285, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.8125, + "epoch": 0.044533333333333334, + "grad_norm": 93.63096618652344, + "kl": 0.51953125, + "learning_rate": 9.777333333333334e-07, + "loss": 0.0208, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.125, + "epoch": 0.04466666666666667, + "grad_norm": 12.1994047164917, + "kl": 0.5419921875, + "learning_rate": 9.776666666666666e-07, + "loss": 0.0217, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1875, + "epoch": 0.0448, + "grad_norm": 8.690802574157715, + "kl": 0.634765625, + "learning_rate": 9.776e-07, + "loss": 0.0254, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.375, + "epoch": 0.04493333333333333, + "grad_norm": 23.208755493164062, + "kl": 0.771484375, + "learning_rate": 9.775333333333334e-07, + "loss": 0.0309, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.8125, + "epoch": 0.045066666666666665, + "grad_norm": 15.945521354675293, + "kl": 0.845703125, + "learning_rate": 9.774666666666668e-07, + "loss": 0.0339, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.75, + "epoch": 0.0452, + "grad_norm": 26.299541473388672, + "kl": 0.6640625, + "learning_rate": 9.774e-07, + "loss": 0.0265, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.0, + "epoch": 0.04533333333333334, + "grad_norm": 11.806915283203125, + "kl": 0.4775390625, + "learning_rate": 9.773333333333333e-07, + "loss": 0.0191, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.0625, + "epoch": 0.04546666666666667, + "grad_norm": 12.869094848632812, + "kl": 0.6484375, + "learning_rate": 9.772666666666665e-07, + "loss": 0.026, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.125, + "epoch": 0.0456, + "grad_norm": 12.902839660644531, + "kl": 0.72265625, + "learning_rate": 9.772e-07, + "loss": 0.0289, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.375, + "epoch": 0.045733333333333334, + "grad_norm": 47.247833251953125, + "kl": 0.916015625, + "learning_rate": 9.771333333333333e-07, + "loss": 0.0366, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.25, + "epoch": 0.04586666666666667, + "grad_norm": 28.29651641845703, + "kl": 0.875, + "learning_rate": 9.770666666666665e-07, + "loss": 0.0349, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 22.0, + "epoch": 0.046, + "grad_norm": 18.99875831604004, + "kl": 1.052734375, + "learning_rate": 9.77e-07, + "loss": 0.0421, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.0, + "epoch": 0.04613333333333333, + "grad_norm": 21.13418960571289, + "kl": 0.919921875, + "learning_rate": 9.769333333333333e-07, + "loss": 0.0368, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.375, + "epoch": 0.046266666666666664, + "grad_norm": 18.35637855529785, + "kl": 1.125, + "learning_rate": 9.768666666666667e-07, + "loss": 0.0449, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.375, + "epoch": 0.0464, + "grad_norm": 14.507014274597168, + "kl": 1.125, + "learning_rate": 9.768e-07, + "loss": 0.0451, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.0, + "epoch": 0.046533333333333336, + "grad_norm": 14.989091873168945, + "kl": 0.724609375, + "learning_rate": 9.767333333333333e-07, + "loss": 0.029, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.0, + "epoch": 0.04666666666666667, + "grad_norm": 15.997438430786133, + "kl": 0.701171875, + "learning_rate": 9.766666666666667e-07, + "loss": 0.028, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.8125, + "epoch": 0.0468, + "grad_norm": 19.441484451293945, + "kl": 1.01171875, + "learning_rate": 9.765999999999999e-07, + "loss": 0.0405, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.875, + "epoch": 0.046933333333333334, + "grad_norm": 16.585491180419922, + "kl": 0.927734375, + "learning_rate": 9.765333333333333e-07, + "loss": 0.037, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.5, + "epoch": 0.047066666666666666, + "grad_norm": 18.52923583984375, + "kl": 1.072265625, + "learning_rate": 9.764666666666667e-07, + "loss": 0.0429, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.375, + "epoch": 0.0472, + "grad_norm": 26.621543884277344, + "kl": 1.3046875, + "learning_rate": 9.764e-07, + "loss": 0.0523, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.9375, + "epoch": 0.04733333333333333, + "grad_norm": 50.76971435546875, + "kl": 0.7890625, + "learning_rate": 9.763333333333333e-07, + "loss": 0.0315, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.4375, + "epoch": 0.047466666666666664, + "grad_norm": 32.94147872924805, + "kl": 0.978515625, + "learning_rate": 9.762666666666667e-07, + "loss": 0.0392, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.4375, + "epoch": 0.0476, + "grad_norm": 47.182621002197266, + "kl": 0.9765625, + "learning_rate": 9.761999999999999e-07, + "loss": 0.039, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.8125, + "epoch": 0.047733333333333336, + "grad_norm": 14.349010467529297, + "kl": 0.84765625, + "learning_rate": 9.761333333333333e-07, + "loss": 0.0339, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.0625, + "epoch": 0.04786666666666667, + "grad_norm": 16.49839973449707, + "kl": 0.810546875, + "learning_rate": 9.760666666666667e-07, + "loss": 0.0324, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.4375, + "epoch": 0.048, + "grad_norm": 26.486318588256836, + "kl": 0.880859375, + "learning_rate": 9.759999999999998e-07, + "loss": 0.0351, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.125, + "epoch": 0.048133333333333334, + "grad_norm": 11.675005912780762, + "kl": 0.76171875, + "learning_rate": 9.759333333333332e-07, + "loss": 0.0304, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.25, + "epoch": 0.048266666666666666, + "grad_norm": 13.540971755981445, + "kl": 0.873046875, + "learning_rate": 9.758666666666666e-07, + "loss": 0.0349, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 0.0484, + "grad_norm": 19.289451599121094, + "kl": 0.94140625, + "learning_rate": 9.758e-07, + "loss": 0.0376, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.0, + "epoch": 0.04853333333333333, + "grad_norm": 13.136825561523438, + "kl": 1.056640625, + "learning_rate": 9.757333333333332e-07, + "loss": 0.0423, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.75, + "epoch": 0.048666666666666664, + "grad_norm": 9.987053871154785, + "kl": 0.9453125, + "learning_rate": 9.756666666666666e-07, + "loss": 0.0378, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.3125, + "epoch": 0.0488, + "grad_norm": 48.735130310058594, + "kl": 0.845703125, + "learning_rate": 9.756e-07, + "loss": 0.0339, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.125, + "epoch": 0.048933333333333336, + "grad_norm": 12.598526954650879, + "kl": 0.533203125, + "learning_rate": 9.755333333333332e-07, + "loss": 0.0214, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1875, + "epoch": 0.04906666666666667, + "grad_norm": 11.14826774597168, + "kl": 0.640625, + "learning_rate": 9.754666666666666e-07, + "loss": 0.0256, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.375, + "epoch": 0.0492, + "grad_norm": 21.126495361328125, + "kl": 0.732421875, + "learning_rate": 9.754e-07, + "loss": 0.0293, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.4375, + "epoch": 0.04933333333333333, + "grad_norm": 14.778359413146973, + "kl": 0.5703125, + "learning_rate": 9.753333333333334e-07, + "loss": 0.0227, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.6875, + "epoch": 0.049466666666666666, + "grad_norm": 35.639678955078125, + "kl": 0.724609375, + "learning_rate": 9.752666666666666e-07, + "loss": 0.029, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 19.0, + "epoch": 0.0496, + "grad_norm": 53.25807189941406, + "kl": 0.763671875, + "learning_rate": 9.752e-07, + "loss": 0.0305, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.1875, + "epoch": 0.04973333333333333, + "grad_norm": 11.609034538269043, + "kl": 0.708984375, + "learning_rate": 9.751333333333332e-07, + "loss": 0.0284, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.75, + "epoch": 0.04986666666666666, + "grad_norm": 10.448272705078125, + "kl": 0.5380859375, + "learning_rate": 9.750666666666666e-07, + "loss": 0.0215, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.9375, + "epoch": 0.05, + "grad_norm": 1.2738537788391113, + "kl": 0.6044921875, + "learning_rate": 9.75e-07, + "loss": 0.0242, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.375, + "epoch": 0.050133333333333335, + "grad_norm": 17.47621726989746, + "kl": 0.5009765625, + "learning_rate": 9.749333333333332e-07, + "loss": 0.02, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.05026666666666667, + "grad_norm": 9.222245216369629, + "kl": 0.4873046875, + "learning_rate": 9.748666666666666e-07, + "loss": 0.0194, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.8125, + "epoch": 0.0504, + "grad_norm": 12.00571060180664, + "kl": 0.49609375, + "learning_rate": 9.748e-07, + "loss": 0.0199, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.0, + "epoch": 0.05053333333333333, + "grad_norm": 11.745697975158691, + "kl": 0.369140625, + "learning_rate": 9.747333333333334e-07, + "loss": 0.0147, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.4375, + "epoch": 0.050666666666666665, + "grad_norm": 12.852128982543945, + "kl": 0.548828125, + "learning_rate": 9.746666666666666e-07, + "loss": 0.0219, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.4375, + "epoch": 0.0508, + "grad_norm": 7.962526321411133, + "kl": 0.4814453125, + "learning_rate": 9.746e-07, + "loss": 0.0192, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.25, + "epoch": 0.05093333333333333, + "grad_norm": 17.810585021972656, + "kl": 0.416015625, + "learning_rate": 9.745333333333334e-07, + "loss": 0.0167, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.05106666666666667, + "grad_norm": 13.061534881591797, + "kl": 0.296875, + "learning_rate": 9.744666666666668e-07, + "loss": 0.0119, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.0, + "epoch": 0.0512, + "grad_norm": 13.507783889770508, + "kl": 0.5087890625, + "learning_rate": 9.744e-07, + "loss": 0.0204, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.4375, + "epoch": 0.051333333333333335, + "grad_norm": 9.285703659057617, + "kl": 0.271484375, + "learning_rate": 9.743333333333333e-07, + "loss": 0.0108, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.125, + "epoch": 0.05146666666666667, + "grad_norm": 13.302173614501953, + "kl": 0.21435546875, + "learning_rate": 9.742666666666665e-07, + "loss": 0.0086, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.6875, + "epoch": 0.0516, + "grad_norm": 10.096393585205078, + "kl": 0.23046875, + "learning_rate": 9.742e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.5940381735563278, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.1875, + "epoch": 0.05173333333333333, + "grad_norm": 72.381103515625, + "kl": 0.3125, + "learning_rate": 9.741333333333333e-07, + "loss": 0.0125, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 0.051866666666666665, + "grad_norm": 18.13732147216797, + "kl": 0.3349609375, + "learning_rate": 9.740666666666665e-07, + "loss": 0.0134, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.4375, + "epoch": 0.052, + "grad_norm": 9.507047653198242, + "kl": 0.137939453125, + "learning_rate": 9.74e-07, + "loss": 0.0055, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.625, + "epoch": 0.05213333333333333, + "grad_norm": 13.085759162902832, + "kl": 0.43896484375, + "learning_rate": 9.739333333333333e-07, + "loss": 0.0175, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.0625, + "epoch": 0.05226666666666667, + "grad_norm": 1.8205143213272095, + "kl": 0.3828125, + "learning_rate": 9.738666666666667e-07, + "loss": 0.0153, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.75, + "epoch": 0.0524, + "grad_norm": 19.239648818969727, + "kl": 0.2275390625, + "learning_rate": 9.737999999999999e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8125, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.625, + "epoch": 0.052533333333333335, + "grad_norm": 8.261813163757324, + "kl": 0.15966796875, + "learning_rate": 9.737333333333333e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 0.05266666666666667, + "grad_norm": 33.83839797973633, + "kl": 0.494140625, + "learning_rate": 9.736666666666667e-07, + "loss": 0.0198, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5, + "epoch": 0.0528, + "grad_norm": 12.379755020141602, + "kl": 0.22412109375, + "learning_rate": 9.735999999999999e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.6875, + "epoch": 0.05293333333333333, + "grad_norm": 13.60884952545166, + "kl": 0.759765625, + "learning_rate": 9.735333333333333e-07, + "loss": 0.0304, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.375, + "epoch": 0.053066666666666665, + "grad_norm": 52.72901916503906, + "kl": 0.2724609375, + "learning_rate": 9.734666666666667e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 0.0532, + "grad_norm": 10.786269187927246, + "kl": 0.458984375, + "learning_rate": 9.734e-07, + "loss": 0.0184, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.5625, + "epoch": 0.05333333333333334, + "grad_norm": 8.141532897949219, + "kl": 0.3564453125, + "learning_rate": 9.733333333333333e-07, + "loss": 0.0143, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.8125, + "epoch": 0.05346666666666667, + "grad_norm": 17.735092163085938, + "kl": 0.3232421875, + "learning_rate": 9.732666666666667e-07, + "loss": 0.013, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.9375, + "epoch": 0.0536, + "grad_norm": 17.15247344970703, + "kl": 0.30712890625, + "learning_rate": 9.731999999999998e-07, + "loss": 0.0123, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.875, + "epoch": 0.053733333333333334, + "grad_norm": 59.046600341796875, + "kl": 1.970703125, + "learning_rate": 9.731333333333332e-07, + "loss": 0.0788, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.75, + "epoch": 0.05386666666666667, + "grad_norm": 18.951587677001953, + "kl": 0.86328125, + "learning_rate": 9.730666666666666e-07, + "loss": 0.0346, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.9375, + "epoch": 0.054, + "grad_norm": 14.954550743103027, + "kl": 0.19873046875, + "learning_rate": 9.729999999999998e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.6875, + "epoch": 0.05413333333333333, + "grad_norm": 34.70091247558594, + "kl": 0.22705078125, + "learning_rate": 9.729333333333332e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.75, + "epoch": 0.054266666666666664, + "grad_norm": 12.623210906982422, + "kl": 0.5361328125, + "learning_rate": 9.728666666666666e-07, + "loss": 0.0214, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.4375, + "epoch": 0.0544, + "grad_norm": 14.480650901794434, + "kl": 0.548828125, + "learning_rate": 9.728e-07, + "loss": 0.022, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.054533333333333336, + "grad_norm": 6.876080513000488, + "kl": 0.3525390625, + "learning_rate": 9.727333333333332e-07, + "loss": 0.0141, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.875, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.5, + "epoch": 0.05466666666666667, + "grad_norm": 10.990650177001953, + "kl": 0.353515625, + "learning_rate": 9.726666666666666e-07, + "loss": 0.0142, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.3125, + "epoch": 0.0548, + "grad_norm": 15.426736831665039, + "kl": 0.33154296875, + "learning_rate": 9.726e-07, + "loss": 0.0133, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.25, + "epoch": 0.054933333333333334, + "grad_norm": 10.265754699707031, + "kl": 0.353515625, + "learning_rate": 9.725333333333334e-07, + "loss": 0.0142, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.6875, + "epoch": 0.05506666666666667, + "grad_norm": 11.173418045043945, + "kl": 0.3349609375, + "learning_rate": 9.724666666666666e-07, + "loss": 0.0134, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 0.0552, + "grad_norm": 19.249183654785156, + "kl": 0.44140625, + "learning_rate": 9.724e-07, + "loss": 0.0177, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.625, + "epoch": 0.05533333333333333, + "grad_norm": 14.902363777160645, + "kl": 0.404296875, + "learning_rate": 9.723333333333334e-07, + "loss": 0.0162, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.4375, + "epoch": 0.055466666666666664, + "grad_norm": 33.99069595336914, + "kl": 0.4150390625, + "learning_rate": 9.722666666666666e-07, + "loss": 0.0166, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.1875, + "epoch": 0.0556, + "grad_norm": 15.210176467895508, + "kl": 0.3671875, + "learning_rate": 9.722e-07, + "loss": 0.0147, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.625, + "epoch": 0.055733333333333336, + "grad_norm": 12.849628448486328, + "kl": 0.4267578125, + "learning_rate": 9.721333333333332e-07, + "loss": 0.0171, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.9375, + "epoch": 0.05586666666666667, + "grad_norm": 12.873249053955078, + "kl": 0.470703125, + "learning_rate": 9.720666666666666e-07, + "loss": 0.0188, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.0, + "epoch": 0.056, + "grad_norm": 22.007381439208984, + "kl": 0.55078125, + "learning_rate": 9.72e-07, + "loss": 0.022, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.1875, + "epoch": 0.056133333333333334, + "grad_norm": 10.312549591064453, + "kl": 0.5703125, + "learning_rate": 9.719333333333334e-07, + "loss": 0.0228, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5625, + "epoch": 0.056266666666666666, + "grad_norm": 20.561447143554688, + "kl": 0.658203125, + "learning_rate": 9.718666666666666e-07, + "loss": 0.0263, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.8125, + "epoch": 0.0564, + "grad_norm": 26.559938430786133, + "kl": 0.701171875, + "learning_rate": 9.718e-07, + "loss": 0.0281, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.5625, + "epoch": 0.05653333333333333, + "grad_norm": 18.858579635620117, + "kl": 0.5859375, + "learning_rate": 9.717333333333334e-07, + "loss": 0.0234, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.0625, + "epoch": 0.056666666666666664, + "grad_norm": 159.29612731933594, + "kl": 5.052734375, + "learning_rate": 9.716666666666665e-07, + "loss": 0.2025, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 0.0568, + "grad_norm": 27.15127182006836, + "kl": 0.55078125, + "learning_rate": 9.716e-07, + "loss": 0.022, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.5, + "epoch": 0.056933333333333336, + "grad_norm": 1.5482721328735352, + "kl": 0.55859375, + "learning_rate": 9.715333333333333e-07, + "loss": 0.0223, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.5625, + "epoch": 0.05706666666666667, + "grad_norm": 25.373706817626953, + "kl": 0.580078125, + "learning_rate": 9.714666666666667e-07, + "loss": 0.0232, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.875, + "epoch": 0.0572, + "grad_norm": 12.387924194335938, + "kl": 0.53515625, + "learning_rate": 9.714e-07, + "loss": 0.0214, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 14.1875, + "epoch": 0.05733333333333333, + "grad_norm": 41.537879943847656, + "kl": 1.390625, + "learning_rate": 9.713333333333333e-07, + "loss": 0.0557, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.6875, + "epoch": 0.057466666666666666, + "grad_norm": 31.619590759277344, + "kl": 0.4736328125, + "learning_rate": 9.712666666666665e-07, + "loss": 0.0189, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 0.0576, + "grad_norm": 62.20487594604492, + "kl": 0.65234375, + "learning_rate": 9.712e-07, + "loss": 0.0262, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.375, + "epoch": 0.05773333333333333, + "grad_norm": 28.121431350708008, + "kl": 0.53515625, + "learning_rate": 9.711333333333333e-07, + "loss": 0.0214, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.375, + "epoch": 0.057866666666666663, + "grad_norm": 23.391422271728516, + "kl": 0.615234375, + "learning_rate": 9.710666666666665e-07, + "loss": 0.0246, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.0, + "epoch": 0.058, + "grad_norm": 11.754870414733887, + "kl": 0.25341796875, + "learning_rate": 9.709999999999999e-07, + "loss": 0.0101, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.6875, + "epoch": 0.058133333333333335, + "grad_norm": 14.770730972290039, + "kl": 0.19873046875, + "learning_rate": 9.709333333333333e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.375, + "epoch": 0.05826666666666667, + "grad_norm": 36.90768814086914, + "kl": 0.380859375, + "learning_rate": 9.708666666666667e-07, + "loss": 0.0152, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.9375, + "epoch": 0.0584, + "grad_norm": 16.566545486450195, + "kl": 0.2353515625, + "learning_rate": 9.707999999999999e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.25, + "epoch": 0.05853333333333333, + "grad_norm": 33.95064926147461, + "kl": 0.3583984375, + "learning_rate": 9.707333333333333e-07, + "loss": 0.0144, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.9375, + "epoch": 0.058666666666666666, + "grad_norm": 96.6959228515625, + "kl": 0.328125, + "learning_rate": 9.706666666666667e-07, + "loss": 0.0131, + "reward": 1.0625, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.0588, + "grad_norm": 6.6185221672058105, + "kl": 0.2119140625, + "learning_rate": 9.706e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.125, + "epoch": 0.05893333333333333, + "grad_norm": 8.816529273986816, + "kl": 0.2958984375, + "learning_rate": 9.705333333333333e-07, + "loss": 0.0119, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 0.05906666666666667, + "grad_norm": 72.35953521728516, + "kl": 0.201171875, + "learning_rate": 9.704666666666667e-07, + "loss": 0.008, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.0592, + "grad_norm": 12.824825286865234, + "kl": 0.188232421875, + "learning_rate": 9.704e-07, + "loss": 0.0075, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.375, + "epoch": 0.059333333333333335, + "grad_norm": 10.250066757202148, + "kl": 0.2099609375, + "learning_rate": 9.703333333333332e-07, + "loss": 0.0084, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.1875, + "epoch": 0.05946666666666667, + "grad_norm": 9.749994277954102, + "kl": 0.22119140625, + "learning_rate": 9.702666666666666e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0625, + "epoch": 0.0596, + "grad_norm": 9.993621826171875, + "kl": 0.130126953125, + "learning_rate": 9.701999999999998e-07, + "loss": 0.0052, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5625, + "epoch": 0.05973333333333333, + "grad_norm": 12.347878456115723, + "kl": 0.1416015625, + "learning_rate": 9.701333333333332e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.625, + "epoch": 0.059866666666666665, + "grad_norm": 11.708304405212402, + "kl": 0.20458984375, + "learning_rate": 9.700666666666666e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.06, + "grad_norm": 13.454201698303223, + "kl": 0.37744140625, + "learning_rate": 9.7e-07, + "loss": 0.0151, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.5, + "epoch": 0.06013333333333333, + "grad_norm": 96.9183349609375, + "kl": 0.4111328125, + "learning_rate": 9.699333333333332e-07, + "loss": 0.0164, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.625, + "epoch": 0.06026666666666667, + "grad_norm": 12.386007308959961, + "kl": 0.4541015625, + "learning_rate": 9.698666666666666e-07, + "loss": 0.0182, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.1875, + "epoch": 0.0604, + "grad_norm": 11.61029052734375, + "kl": 0.232421875, + "learning_rate": 9.698e-07, + "loss": 0.0093, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.375, + "epoch": 0.060533333333333335, + "grad_norm": 30.595659255981445, + "kl": 0.4189453125, + "learning_rate": 9.697333333333332e-07, + "loss": 0.0168, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.75, + "epoch": 0.06066666666666667, + "grad_norm": 18.976865768432617, + "kl": 0.49853515625, + "learning_rate": 9.696666666666666e-07, + "loss": 0.02, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.75, + "epoch": 0.0608, + "grad_norm": 8.336684226989746, + "kl": 0.376953125, + "learning_rate": 9.696e-07, + "loss": 0.0151, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.3125, + "epoch": 0.06093333333333333, + "grad_norm": 15.61872386932373, + "kl": 0.4609375, + "learning_rate": 9.695333333333334e-07, + "loss": 0.0184, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.6875, + "epoch": 0.061066666666666665, + "grad_norm": 18.142169952392578, + "kl": 0.810546875, + "learning_rate": 9.694666666666666e-07, + "loss": 0.0324, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.9375, + "epoch": 0.0612, + "grad_norm": 7.288699150085449, + "kl": 0.4375, + "learning_rate": 9.694e-07, + "loss": 0.0175, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.75, + "epoch": 0.06133333333333333, + "grad_norm": 16.548452377319336, + "kl": 0.5205078125, + "learning_rate": 9.693333333333334e-07, + "loss": 0.0209, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.6875, + "epoch": 0.06146666666666667, + "grad_norm": 15.561515808105469, + "kl": 0.740234375, + "learning_rate": 9.692666666666666e-07, + "loss": 0.0296, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.3125, + "epoch": 0.0616, + "grad_norm": 17.573766708374023, + "kl": 0.744140625, + "learning_rate": 9.692e-07, + "loss": 0.0298, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.5, + "epoch": 0.061733333333333335, + "grad_norm": 17.27387046813965, + "kl": 0.673828125, + "learning_rate": 9.691333333333332e-07, + "loss": 0.0269, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.625, + "epoch": 0.06186666666666667, + "grad_norm": 24.19248390197754, + "kl": 0.4560546875, + "learning_rate": 9.690666666666666e-07, + "loss": 0.0183, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.1875, + "epoch": 0.062, + "grad_norm": 8.30177116394043, + "kl": 0.716796875, + "learning_rate": 9.69e-07, + "loss": 0.0287, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.06213333333333333, + "grad_norm": 60.83865737915039, + "kl": 0.3515625, + "learning_rate": 9.689333333333334e-07, + "loss": 0.0141, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.1875, + "epoch": 0.062266666666666665, + "grad_norm": 22.40559959411621, + "kl": 0.849609375, + "learning_rate": 9.688666666666665e-07, + "loss": 0.0339, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.8125, + "epoch": 0.0624, + "grad_norm": 13.420199394226074, + "kl": 0.31201171875, + "learning_rate": 9.688e-07, + "loss": 0.0125, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.3125, + "epoch": 0.06253333333333333, + "grad_norm": 11.516633033752441, + "kl": 0.66796875, + "learning_rate": 9.687333333333333e-07, + "loss": 0.0267, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.5, + "epoch": 0.06266666666666666, + "grad_norm": 35.60871887207031, + "kl": 0.638671875, + "learning_rate": 9.686666666666667e-07, + "loss": 0.0256, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.5, + "epoch": 0.0628, + "grad_norm": 13.774245262145996, + "kl": 0.6015625, + "learning_rate": 9.686e-07, + "loss": 0.0241, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.125, + "epoch": 0.06293333333333333, + "grad_norm": 19.49705696105957, + "kl": 0.4423828125, + "learning_rate": 9.685333333333333e-07, + "loss": 0.0177, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.875, + "epoch": 0.06306666666666666, + "grad_norm": 12.603809356689453, + "kl": 0.697265625, + "learning_rate": 9.684666666666667e-07, + "loss": 0.0279, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 0.0632, + "grad_norm": 25.171119689941406, + "kl": 0.3681640625, + "learning_rate": 9.684e-07, + "loss": 0.0147, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.75, + "epoch": 0.06333333333333334, + "grad_norm": 17.066482543945312, + "kl": 0.482421875, + "learning_rate": 9.683333333333333e-07, + "loss": 0.0193, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.25, + "epoch": 0.06346666666666667, + "grad_norm": 17.17740821838379, + "kl": 0.3837890625, + "learning_rate": 9.682666666666667e-07, + "loss": 0.0153, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.5, + "epoch": 0.0636, + "grad_norm": 6.654721736907959, + "kl": 0.4384765625, + "learning_rate": 9.682e-07, + "loss": 0.0175, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.5, + "epoch": 0.06373333333333334, + "grad_norm": 13.037439346313477, + "kl": 0.421875, + "learning_rate": 9.681333333333333e-07, + "loss": 0.0168, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.625, + "epoch": 0.06386666666666667, + "grad_norm": 13.708234786987305, + "kl": 0.4638671875, + "learning_rate": 9.680666666666667e-07, + "loss": 0.0185, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.6875, + "epoch": 0.064, + "grad_norm": 22.003662109375, + "kl": 0.6025390625, + "learning_rate": 9.679999999999999e-07, + "loss": 0.0241, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.1875, + "epoch": 0.06413333333333333, + "grad_norm": 2.230447292327881, + "kl": 0.5947265625, + "learning_rate": 9.679333333333333e-07, + "loss": 0.0237, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.875, + "epoch": 0.06426666666666667, + "grad_norm": 26.63912582397461, + "kl": 0.390625, + "learning_rate": 9.678666666666667e-07, + "loss": 0.0156, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.8125, + "epoch": 0.0644, + "grad_norm": 14.082049369812012, + "kl": 0.294921875, + "learning_rate": 9.677999999999999e-07, + "loss": 0.0118, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.8125, + "epoch": 0.06453333333333333, + "grad_norm": 7.383893966674805, + "kl": 0.5, + "learning_rate": 9.677333333333333e-07, + "loss": 0.0201, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.9375, + "epoch": 0.06466666666666666, + "grad_norm": 6.144572734832764, + "kl": 0.26318359375, + "learning_rate": 9.676666666666667e-07, + "loss": 0.0105, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.5, + "epoch": 0.0648, + "grad_norm": 3.5889456272125244, + "kl": 0.474609375, + "learning_rate": 9.676e-07, + "loss": 0.019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.625, + "epoch": 0.06493333333333333, + "grad_norm": 6.137612342834473, + "kl": 0.265625, + "learning_rate": 9.675333333333332e-07, + "loss": 0.0106, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.0625, + "epoch": 0.06506666666666666, + "grad_norm": 13.67559814453125, + "kl": 0.3984375, + "learning_rate": 9.674666666666666e-07, + "loss": 0.0159, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.5625, + "epoch": 0.0652, + "grad_norm": 1.2286144495010376, + "kl": 0.4375, + "learning_rate": 9.674e-07, + "loss": 0.0175, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.06533333333333333, + "grad_norm": 24.024499893188477, + "kl": 0.3251953125, + "learning_rate": 9.673333333333332e-07, + "loss": 0.0131, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.875, + "epoch": 0.06546666666666667, + "grad_norm": 10.141944885253906, + "kl": 0.6796875, + "learning_rate": 9.672666666666666e-07, + "loss": 0.0272, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.9375, + "epoch": 0.0656, + "grad_norm": 14.667461395263672, + "kl": 0.341796875, + "learning_rate": 9.671999999999998e-07, + "loss": 0.0137, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.6875, + "epoch": 0.06573333333333334, + "grad_norm": 7.242129802703857, + "kl": 0.357421875, + "learning_rate": 9.671333333333332e-07, + "loss": 0.0143, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.75, + "epoch": 0.06586666666666667, + "grad_norm": 9.312252044677734, + "kl": 0.2119140625, + "learning_rate": 9.670666666666666e-07, + "loss": 0.0085, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.6875, + "epoch": 0.066, + "grad_norm": 13.446245193481445, + "kl": 0.5859375, + "learning_rate": 9.67e-07, + "loss": 0.0234, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.75, + "epoch": 0.06613333333333334, + "grad_norm": 6.839236259460449, + "kl": 0.44140625, + "learning_rate": 9.669333333333332e-07, + "loss": 0.0177, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.06626666666666667, + "grad_norm": 9.681421279907227, + "kl": 0.1796875, + "learning_rate": 9.668666666666666e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.25, + "epoch": 0.0664, + "grad_norm": 12.38398265838623, + "kl": 0.3173828125, + "learning_rate": 9.668e-07, + "loss": 0.0127, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.06653333333333333, + "grad_norm": 20.054065704345703, + "kl": 0.2978515625, + "learning_rate": 9.667333333333332e-07, + "loss": 0.0119, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5625, + "epoch": 0.06666666666666667, + "grad_norm": 16.03452491760254, + "kl": 0.31201171875, + "learning_rate": 9.666666666666666e-07, + "loss": 0.0125, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4375, + "epoch": 0.0668, + "grad_norm": 8.62741470336914, + "kl": 0.17333984375, + "learning_rate": 9.666e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.06693333333333333, + "grad_norm": 16.30545425415039, + "kl": 0.22607421875, + "learning_rate": 9.665333333333334e-07, + "loss": 0.009, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.4375, + "epoch": 0.06706666666666666, + "grad_norm": 6.870120525360107, + "kl": 0.2763671875, + "learning_rate": 9.664666666666666e-07, + "loss": 0.011, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.8125, + "epoch": 0.0672, + "grad_norm": 8.80592155456543, + "kl": 0.28515625, + "learning_rate": 9.664e-07, + "loss": 0.0114, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.75, + "epoch": 0.06733333333333333, + "grad_norm": 17.509504318237305, + "kl": 0.32763671875, + "learning_rate": 9.663333333333334e-07, + "loss": 0.0131, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.06746666666666666, + "grad_norm": 72.8773193359375, + "kl": 0.2373046875, + "learning_rate": 9.662666666666668e-07, + "loss": 0.0095, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.875, + "epoch": 0.0676, + "grad_norm": 5.0311808586120605, + "kl": 0.19287109375, + "learning_rate": 9.662e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.06773333333333334, + "grad_norm": 9.056401252746582, + "kl": 0.1650390625, + "learning_rate": 9.661333333333331e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 0.06786666666666667, + "grad_norm": 0.5020315051078796, + "kl": 0.171875, + "learning_rate": 9.660666666666665e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.625, + "epoch": 0.068, + "grad_norm": 10.890634536743164, + "kl": 0.224609375, + "learning_rate": 9.66e-07, + "loss": 0.009, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.125, + "epoch": 0.06813333333333334, + "grad_norm": 11.931864738464355, + "kl": 0.13623046875, + "learning_rate": 9.659333333333333e-07, + "loss": 0.0055, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.125, + "epoch": 0.06826666666666667, + "grad_norm": 8.597017288208008, + "kl": 0.16259765625, + "learning_rate": 9.658666666666665e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.375, + "epoch": 0.0684, + "grad_norm": 8.392215728759766, + "kl": 0.208984375, + "learning_rate": 9.658e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.25, + "epoch": 0.06853333333333333, + "grad_norm": 15.547290802001953, + "kl": 0.228515625, + "learning_rate": 9.657333333333333e-07, + "loss": 0.0092, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.06866666666666667, + "grad_norm": 4.506417274475098, + "kl": 0.1708984375, + "learning_rate": 9.656666666666667e-07, + "loss": 0.0068, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.8125, + "epoch": 0.0688, + "grad_norm": 10.876577377319336, + "kl": 0.2041015625, + "learning_rate": 9.656e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0, + "epoch": 0.06893333333333333, + "grad_norm": 4.733780860900879, + "kl": 0.154296875, + "learning_rate": 9.655333333333333e-07, + "loss": 0.0062, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.375, + "epoch": 0.06906666666666667, + "grad_norm": 18.513338088989258, + "kl": 0.14111328125, + "learning_rate": 9.654666666666667e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.6875, + "epoch": 0.0692, + "grad_norm": 8.089277267456055, + "kl": 0.157958984375, + "learning_rate": 9.654e-07, + "loss": 0.0063, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 0.06933333333333333, + "grad_norm": 59.8182373046875, + "kl": 0.155517578125, + "learning_rate": 9.653333333333333e-07, + "loss": 0.0062, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.75, + "epoch": 0.06946666666666666, + "grad_norm": 8.925905227661133, + "kl": 0.123291015625, + "learning_rate": 9.652666666666667e-07, + "loss": 0.0049, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.875, + "epoch": 0.0696, + "grad_norm": 6.979102611541748, + "kl": 0.099609375, + "learning_rate": 9.651999999999999e-07, + "loss": 0.004, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.875, + "epoch": 0.06973333333333333, + "grad_norm": 0.4731016755104065, + "kl": 0.17041015625, + "learning_rate": 9.651333333333333e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.1875, + "epoch": 0.06986666666666666, + "grad_norm": 7.475340843200684, + "kl": 0.1943359375, + "learning_rate": 9.650666666666667e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5625, + "epoch": 0.07, + "grad_norm": 8.191786766052246, + "kl": 0.108642578125, + "learning_rate": 9.649999999999999e-07, + "loss": 0.0043, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.375, + "epoch": 0.07013333333333334, + "grad_norm": 4.927967548370361, + "kl": 0.2412109375, + "learning_rate": 9.649333333333333e-07, + "loss": 0.0097, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.9375, + "epoch": 0.07026666666666667, + "grad_norm": 2.308570146560669, + "kl": 0.19873046875, + "learning_rate": 9.648666666666667e-07, + "loss": 0.008, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.125, + "epoch": 0.0704, + "grad_norm": 10.725434303283691, + "kl": 0.195068359375, + "learning_rate": 9.647999999999999e-07, + "loss": 0.0078, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.6875, + "epoch": 0.07053333333333334, + "grad_norm": 9.536055564880371, + "kl": 0.153564453125, + "learning_rate": 9.647333333333333e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.1875, + "epoch": 0.07066666666666667, + "grad_norm": 9.00808334350586, + "kl": 0.162353515625, + "learning_rate": 9.646666666666666e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.875, + "epoch": 0.0708, + "grad_norm": 7.739562034606934, + "kl": 0.154296875, + "learning_rate": 9.646e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.9375, + "epoch": 0.07093333333333333, + "grad_norm": 6.2417097091674805, + "kl": 0.2021484375, + "learning_rate": 9.645333333333332e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.1875, + "epoch": 0.07106666666666667, + "grad_norm": 17.337581634521484, + "kl": 0.15869140625, + "learning_rate": 9.644666666666666e-07, + "loss": 0.0063, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.0712, + "grad_norm": 10.095739364624023, + "kl": 0.208984375, + "learning_rate": 9.644e-07, + "loss": 0.0084, + "reward": 1.3125, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.8125, + "epoch": 0.07133333333333333, + "grad_norm": 10.032266616821289, + "kl": 0.114013671875, + "learning_rate": 9.643333333333334e-07, + "loss": 0.0045, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.25, + "epoch": 0.07146666666666666, + "grad_norm": 5.0000786781311035, + "kl": 0.13330078125, + "learning_rate": 9.642666666666666e-07, + "loss": 0.0053, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.75, + "epoch": 0.0716, + "grad_norm": 7.383081436157227, + "kl": 0.19189453125, + "learning_rate": 9.641999999999998e-07, + "loss": 0.0077, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.07173333333333333, + "grad_norm": 10.353839874267578, + "kl": 0.1533203125, + "learning_rate": 9.641333333333332e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.07186666666666666, + "grad_norm": 6.7472310066223145, + "kl": 0.119384765625, + "learning_rate": 9.640666666666666e-07, + "loss": 0.0048, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 0.072, + "grad_norm": 13.677891731262207, + "kl": 0.104736328125, + "learning_rate": 9.64e-07, + "loss": 0.0042, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.3125, + "epoch": 0.07213333333333333, + "grad_norm": 11.001093864440918, + "kl": 0.22509765625, + "learning_rate": 9.639333333333332e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6875, + "epoch": 0.07226666666666667, + "grad_norm": 34.559326171875, + "kl": 0.098876953125, + "learning_rate": 9.638666666666666e-07, + "loss": 0.004, + "reward": 0.9375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.75, + "epoch": 0.0724, + "grad_norm": 7.757643699645996, + "kl": 0.2119140625, + "learning_rate": 9.638e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.07253333333333334, + "grad_norm": 7.087191104888916, + "kl": 0.2177734375, + "learning_rate": 9.637333333333334e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 0.07266666666666667, + "grad_norm": 11.824114799499512, + "kl": 0.12939453125, + "learning_rate": 9.636666666666666e-07, + "loss": 0.0052, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.125, + "epoch": 0.0728, + "grad_norm": 6.569311141967773, + "kl": 0.190673828125, + "learning_rate": 9.636e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0, + "epoch": 0.07293333333333334, + "grad_norm": 16.450868606567383, + "kl": 0.20068359375, + "learning_rate": 9.635333333333334e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.07306666666666667, + "grad_norm": 9.047338485717773, + "kl": 0.135009765625, + "learning_rate": 9.634666666666666e-07, + "loss": 0.0054, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.25, + "epoch": 0.0732, + "grad_norm": 6.890579700469971, + "kl": 0.11669921875, + "learning_rate": 9.634e-07, + "loss": 0.0047, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.3125, + "epoch": 0.07333333333333333, + "grad_norm": 6.948051929473877, + "kl": 0.14501953125, + "learning_rate": 9.633333333333334e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0, + "epoch": 0.07346666666666667, + "grad_norm": 10.095874786376953, + "kl": 0.2109375, + "learning_rate": 9.632666666666668e-07, + "loss": 0.0085, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.0736, + "grad_norm": 9.598273277282715, + "kl": 0.15771484375, + "learning_rate": 9.632e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.07373333333333333, + "grad_norm": 12.861584663391113, + "kl": 0.29150390625, + "learning_rate": 9.631333333333333e-07, + "loss": 0.0117, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.6875, + "epoch": 0.07386666666666666, + "grad_norm": 22.780933380126953, + "kl": 0.4951171875, + "learning_rate": 9.630666666666665e-07, + "loss": 0.0198, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.3125, + "epoch": 0.074, + "grad_norm": 7.440788269042969, + "kl": 0.10205078125, + "learning_rate": 9.63e-07, + "loss": 0.0041, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.9375, + "epoch": 0.07413333333333333, + "grad_norm": 8.675019264221191, + "kl": 0.15869140625, + "learning_rate": 9.629333333333333e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.75, + "epoch": 0.07426666666666666, + "grad_norm": 8.196403503417969, + "kl": 0.185546875, + "learning_rate": 9.628666666666665e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.75, + "epoch": 0.0744, + "grad_norm": 5.36301851272583, + "kl": 0.1259765625, + "learning_rate": 9.628e-07, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.3125, + "epoch": 0.07453333333333333, + "grad_norm": 9.544254302978516, + "kl": 0.173828125, + "learning_rate": 9.627333333333333e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 0.07466666666666667, + "grad_norm": 7.682737827301025, + "kl": 0.14453125, + "learning_rate": 9.626666666666667e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.75, + "epoch": 0.0748, + "grad_norm": 7.483381748199463, + "kl": 0.104736328125, + "learning_rate": 9.626e-07, + "loss": 0.0042, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.9375, + "epoch": 0.07493333333333334, + "grad_norm": 9.866740226745605, + "kl": 0.203125, + "learning_rate": 9.625333333333333e-07, + "loss": 0.0081, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.25, + "epoch": 0.07506666666666667, + "grad_norm": 12.355622291564941, + "kl": 0.20361328125, + "learning_rate": 9.624666666666667e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.875, + "epoch": 0.0752, + "grad_norm": 0.3621254563331604, + "kl": 0.115966796875, + "learning_rate": 9.624e-07, + "loss": 0.0046, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.07533333333333334, + "grad_norm": 6.2519612312316895, + "kl": 0.1875, + "learning_rate": 9.623333333333333e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 0.07546666666666667, + "grad_norm": 9.664374351501465, + "kl": 0.1826171875, + "learning_rate": 9.622666666666667e-07, + "loss": 0.0073, + "reward": 1.1875, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.625, + "epoch": 0.0756, + "grad_norm": 38.73763656616211, + "kl": 0.1650390625, + "learning_rate": 9.622e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.0, + "epoch": 0.07573333333333333, + "grad_norm": 8.713264465332031, + "kl": 0.2978515625, + "learning_rate": 9.621333333333333e-07, + "loss": 0.0119, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.5, + "epoch": 0.07586666666666667, + "grad_norm": 20.329023361206055, + "kl": 0.26123046875, + "learning_rate": 9.620666666666667e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.076, + "grad_norm": 10.912956237792969, + "kl": 0.1328125, + "learning_rate": 9.619999999999999e-07, + "loss": 0.0053, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.25, + "epoch": 0.07613333333333333, + "grad_norm": 10.553218841552734, + "kl": 0.23583984375, + "learning_rate": 9.619333333333333e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.07626666666666666, + "grad_norm": 5.746993064880371, + "kl": 0.1845703125, + "learning_rate": 9.618666666666667e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.375, + "epoch": 0.0764, + "grad_norm": 9.003890037536621, + "kl": 0.17236328125, + "learning_rate": 9.618e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.625, + "epoch": 0.07653333333333333, + "grad_norm": 20.834745407104492, + "kl": 0.17236328125, + "learning_rate": 9.617333333333332e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5, + "epoch": 0.07666666666666666, + "grad_norm": 12.931807518005371, + "kl": 0.23828125, + "learning_rate": 9.616666666666666e-07, + "loss": 0.0095, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 0.0768, + "grad_norm": 8.805492401123047, + "kl": 0.17578125, + "learning_rate": 9.616e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.5, + "epoch": 0.07693333333333334, + "grad_norm": 10.625080108642578, + "kl": 0.1826171875, + "learning_rate": 9.615333333333332e-07, + "loss": 0.0073, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.9375, + "epoch": 0.07706666666666667, + "grad_norm": 11.116710662841797, + "kl": 0.2177734375, + "learning_rate": 9.614666666666666e-07, + "loss": 0.0087, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.625, + "epoch": 0.0772, + "grad_norm": 6.440462589263916, + "kl": 0.134765625, + "learning_rate": 9.614e-07, + "loss": 0.0054, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.25, + "epoch": 0.07733333333333334, + "grad_norm": 5.263782501220703, + "kl": 0.11669921875, + "learning_rate": 9.613333333333334e-07, + "loss": 0.0047, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 0.07746666666666667, + "grad_norm": 8.556984901428223, + "kl": 0.1435546875, + "learning_rate": 9.612666666666666e-07, + "loss": 0.0057, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.0776, + "grad_norm": 0.5458760857582092, + "kl": 0.1982421875, + "learning_rate": 9.612e-07, + "loss": 0.0079, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.07773333333333333, + "grad_norm": 3.295121669769287, + "kl": 0.10595703125, + "learning_rate": 9.611333333333332e-07, + "loss": 0.0042, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.625, + "epoch": 0.07786666666666667, + "grad_norm": 8.650521278381348, + "kl": 0.127197265625, + "learning_rate": 9.610666666666666e-07, + "loss": 0.0051, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5, + "epoch": 0.078, + "grad_norm": 9.768342971801758, + "kl": 0.17578125, + "learning_rate": 9.61e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 0.07813333333333333, + "grad_norm": 8.557844161987305, + "kl": 0.118896484375, + "learning_rate": 9.609333333333332e-07, + "loss": 0.0048, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.3125, + "epoch": 0.07826666666666666, + "grad_norm": 6.692380905151367, + "kl": 0.24462890625, + "learning_rate": 9.608666666666666e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.3125, + "epoch": 0.0784, + "grad_norm": 8.855413436889648, + "kl": 0.13623046875, + "learning_rate": 9.608e-07, + "loss": 0.0054, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.8125, + "epoch": 0.07853333333333333, + "grad_norm": 12.185615539550781, + "kl": 0.1455078125, + "learning_rate": 9.607333333333334e-07, + "loss": 0.0058, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.9375, + "epoch": 0.07866666666666666, + "grad_norm": 10.081452369689941, + "kl": 0.2080078125, + "learning_rate": 9.606666666666666e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.875, + "epoch": 0.0788, + "grad_norm": 9.857898712158203, + "kl": 0.19677734375, + "learning_rate": 9.606e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.25, + "epoch": 0.07893333333333333, + "grad_norm": 9.656088829040527, + "kl": 0.1796875, + "learning_rate": 9.605333333333334e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 0.07906666666666666, + "grad_norm": 7.250821113586426, + "kl": 0.127685546875, + "learning_rate": 9.604666666666665e-07, + "loss": 0.0051, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.4375, + "epoch": 0.0792, + "grad_norm": 13.448006629943848, + "kl": 0.28955078125, + "learning_rate": 9.604e-07, + "loss": 0.0116, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.3125, + "epoch": 0.07933333333333334, + "grad_norm": 9.728422164916992, + "kl": 0.18115234375, + "learning_rate": 9.603333333333333e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.625, + "epoch": 0.07946666666666667, + "grad_norm": 8.965285301208496, + "kl": 0.171875, + "learning_rate": 9.602666666666667e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.375, + "epoch": 0.0796, + "grad_norm": 11.567111015319824, + "kl": 0.24853515625, + "learning_rate": 9.602e-07, + "loss": 0.01, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.9375, + "epoch": 0.07973333333333334, + "grad_norm": 4.7951788902282715, + "kl": 0.21533203125, + "learning_rate": 9.601333333333333e-07, + "loss": 0.0086, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.375, + "epoch": 0.07986666666666667, + "grad_norm": 6.96920919418335, + "kl": 0.259765625, + "learning_rate": 9.600666666666665e-07, + "loss": 0.0104, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.3125, + "epoch": 0.08, + "grad_norm": 10.779230117797852, + "kl": 0.1513671875, + "learning_rate": 9.6e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.3125, + "epoch": 0.08013333333333333, + "grad_norm": 12.751134872436523, + "kl": 0.361328125, + "learning_rate": 9.599333333333333e-07, + "loss": 0.0144, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.6875, + "epoch": 0.08026666666666667, + "grad_norm": 8.141751289367676, + "kl": 0.455078125, + "learning_rate": 9.598666666666665e-07, + "loss": 0.0182, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.8125, + "epoch": 0.0804, + "grad_norm": 11.511007308959961, + "kl": 0.22119140625, + "learning_rate": 9.598e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.125, + "epoch": 0.08053333333333333, + "grad_norm": 9.858654022216797, + "kl": 0.1875, + "learning_rate": 9.597333333333333e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5625, + "epoch": 0.08066666666666666, + "grad_norm": 1.0845675468444824, + "kl": 0.29150390625, + "learning_rate": 9.596666666666667e-07, + "loss": 0.0117, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.375, + "epoch": 0.0808, + "grad_norm": 10.805809020996094, + "kl": 0.37109375, + "learning_rate": 9.595999999999999e-07, + "loss": 0.0148, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.375, + "epoch": 0.08093333333333333, + "grad_norm": 11.949896812438965, + "kl": 0.3037109375, + "learning_rate": 9.595333333333333e-07, + "loss": 0.0122, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.5, + "epoch": 0.08106666666666666, + "grad_norm": 10.481163024902344, + "kl": 0.376953125, + "learning_rate": 9.594666666666667e-07, + "loss": 0.0151, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.8125, + "epoch": 0.0812, + "grad_norm": 1.3636586666107178, + "kl": 0.3291015625, + "learning_rate": 9.594e-07, + "loss": 0.0132, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.8125, + "epoch": 0.08133333333333333, + "grad_norm": 1.381439447402954, + "kl": 0.4228515625, + "learning_rate": 9.593333333333333e-07, + "loss": 0.0169, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5625, + "epoch": 0.08146666666666667, + "grad_norm": 5.443018436431885, + "kl": 0.19091796875, + "learning_rate": 9.592666666666667e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.0816, + "grad_norm": 6.046571731567383, + "kl": 0.1923828125, + "learning_rate": 9.592e-07, + "loss": 0.0077, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.75, + "epoch": 0.08173333333333334, + "grad_norm": 11.043769836425781, + "kl": 0.18115234375, + "learning_rate": 9.591333333333333e-07, + "loss": 0.0072, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.5625, + "epoch": 0.08186666666666667, + "grad_norm": 7.730051040649414, + "kl": 0.1806640625, + "learning_rate": 9.590666666666667e-07, + "loss": 0.0072, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.4375, + "epoch": 0.082, + "grad_norm": 8.522453308105469, + "kl": 0.17626953125, + "learning_rate": 9.589999999999998e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 0.08213333333333334, + "grad_norm": 29.88450813293457, + "kl": 0.1669921875, + "learning_rate": 9.589333333333332e-07, + "loss": 0.0067, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.3125, + "epoch": 0.08226666666666667, + "grad_norm": 8.313015937805176, + "kl": 0.19482421875, + "learning_rate": 9.588666666666666e-07, + "loss": 0.0078, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 0.0824, + "grad_norm": 11.750452041625977, + "kl": 0.169189453125, + "learning_rate": 9.588e-07, + "loss": 0.0068, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.08253333333333333, + "grad_norm": 5.533456802368164, + "kl": 0.115478515625, + "learning_rate": 9.587333333333332e-07, + "loss": 0.0046, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 0.08266666666666667, + "grad_norm": 5.07309103012085, + "kl": 0.1123046875, + "learning_rate": 9.586666666666666e-07, + "loss": 0.0045, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.9375, + "epoch": 0.0828, + "grad_norm": 8.042708396911621, + "kl": 0.098876953125, + "learning_rate": 9.586e-07, + "loss": 0.004, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8125, + "epoch": 0.08293333333333333, + "grad_norm": 8.501932144165039, + "kl": 0.206787109375, + "learning_rate": 9.585333333333332e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 0.08306666666666666, + "grad_norm": 4.287995338439941, + "kl": 0.1318359375, + "learning_rate": 9.584666666666666e-07, + "loss": 0.0053, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0625, + "epoch": 0.0832, + "grad_norm": 10.096467971801758, + "kl": 0.1767578125, + "learning_rate": 9.584e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.75, + "epoch": 0.08333333333333333, + "grad_norm": 8.549686431884766, + "kl": 0.14990234375, + "learning_rate": 9.583333333333334e-07, + "loss": 0.006, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.08346666666666666, + "grad_norm": 4.14879035949707, + "kl": 0.1533203125, + "learning_rate": 9.582666666666666e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.8125, + "epoch": 0.0836, + "grad_norm": 9.571572303771973, + "kl": 0.16259765625, + "learning_rate": 9.582e-07, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0625, + "epoch": 0.08373333333333334, + "grad_norm": 8.61742115020752, + "kl": 0.118408203125, + "learning_rate": 9.581333333333332e-07, + "loss": 0.0047, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.08386666666666667, + "grad_norm": 9.840831756591797, + "kl": 0.21875, + "learning_rate": 9.580666666666666e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5625, + "epoch": 0.084, + "grad_norm": 5.004394054412842, + "kl": 0.1552734375, + "learning_rate": 9.58e-07, + "loss": 0.0062, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 0.08413333333333334, + "grad_norm": 8.319250106811523, + "kl": 0.107666015625, + "learning_rate": 9.579333333333332e-07, + "loss": 0.0043, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.375, + "epoch": 0.08426666666666667, + "grad_norm": 9.950072288513184, + "kl": 0.2177734375, + "learning_rate": 9.578666666666666e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.0844, + "grad_norm": 7.9373698234558105, + "kl": 0.142578125, + "learning_rate": 9.578e-07, + "loss": 0.0057, + "reward": 1.1875, + "reward_std": 0.6199793070554733, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 0.08453333333333334, + "grad_norm": 10.766485214233398, + "kl": 0.111328125, + "learning_rate": 9.577333333333334e-07, + "loss": 0.0045, + "reward": 1.5625, + "reward_std": 0.6895177364349365, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5, + "epoch": 0.08466666666666667, + "grad_norm": 6.549779891967773, + "kl": 0.1435546875, + "learning_rate": 9.576666666666665e-07, + "loss": 0.0057, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.8125, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.1875, + "epoch": 0.0848, + "grad_norm": 10.308985710144043, + "kl": 0.15283203125, + "learning_rate": 9.576e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.4375, + "epoch": 0.08493333333333333, + "grad_norm": 13.686258316040039, + "kl": 0.128662109375, + "learning_rate": 9.575333333333333e-07, + "loss": 0.0051, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5625, + "epoch": 0.08506666666666667, + "grad_norm": 6.766195774078369, + "kl": 0.116455078125, + "learning_rate": 9.574666666666667e-07, + "loss": 0.0047, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.75, + "epoch": 0.0852, + "grad_norm": 126.77899932861328, + "kl": 4.2470703125, + "learning_rate": 9.574e-07, + "loss": 0.1693, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.08533333333333333, + "grad_norm": 10.562539100646973, + "kl": 0.15625, + "learning_rate": 9.573333333333333e-07, + "loss": 0.0063, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.08546666666666666, + "grad_norm": 5.4314656257629395, + "kl": 0.1552734375, + "learning_rate": 9.572666666666667e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 0.0856, + "grad_norm": 7.937184810638428, + "kl": 0.1337890625, + "learning_rate": 9.572e-07, + "loss": 0.0054, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.125, + "epoch": 0.08573333333333333, + "grad_norm": 7.983271598815918, + "kl": 0.14990234375, + "learning_rate": 9.571333333333333e-07, + "loss": 0.006, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0625, + "epoch": 0.08586666666666666, + "grad_norm": 7.130800247192383, + "kl": 0.122314453125, + "learning_rate": 9.570666666666665e-07, + "loss": 0.0049, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5625, + "epoch": 0.086, + "grad_norm": 153.51991271972656, + "kl": 0.159912109375, + "learning_rate": 9.57e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.08613333333333334, + "grad_norm": 0.43031811714172363, + "kl": 0.11865234375, + "learning_rate": 9.569333333333333e-07, + "loss": 0.0047, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.08626666666666667, + "grad_norm": 8.92037582397461, + "kl": 0.11767578125, + "learning_rate": 9.568666666666667e-07, + "loss": 0.0047, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.0864, + "grad_norm": 8.90683364868164, + "kl": 0.16259765625, + "learning_rate": 9.567999999999999e-07, + "loss": 0.0065, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.625, + "epoch": 0.08653333333333334, + "grad_norm": 5.241005897521973, + "kl": 0.162353515625, + "learning_rate": 9.567333333333333e-07, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.0, + "epoch": 0.08666666666666667, + "grad_norm": 9.780768394470215, + "kl": 0.21923828125, + "learning_rate": 9.566666666666667e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.0868, + "grad_norm": 8.050829887390137, + "kl": 0.1640625, + "learning_rate": 9.565999999999999e-07, + "loss": 0.0066, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 0.08693333333333333, + "grad_norm": 0.7883680462837219, + "kl": 0.1923828125, + "learning_rate": 9.565333333333333e-07, + "loss": 0.0077, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.08706666666666667, + "grad_norm": 10.890473365783691, + "kl": 0.18994140625, + "learning_rate": 9.564666666666667e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.75, + "epoch": 0.0872, + "grad_norm": 7.666045665740967, + "kl": 0.2705078125, + "learning_rate": 9.564e-07, + "loss": 0.0108, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.08733333333333333, + "grad_norm": 11.583446502685547, + "kl": 0.20751953125, + "learning_rate": 9.563333333333333e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.125, + "epoch": 0.08746666666666666, + "grad_norm": 14.119894027709961, + "kl": 0.11083984375, + "learning_rate": 9.562666666666667e-07, + "loss": 0.0044, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 0.0876, + "grad_norm": 5.775589942932129, + "kl": 0.135498046875, + "learning_rate": 9.562e-07, + "loss": 0.0054, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.9375, + "epoch": 0.08773333333333333, + "grad_norm": 10.275223731994629, + "kl": 0.22509765625, + "learning_rate": 9.561333333333332e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.08786666666666666, + "grad_norm": 6.933783054351807, + "kl": 0.16796875, + "learning_rate": 9.560666666666666e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 0.088, + "grad_norm": 6.922656536102295, + "kl": 0.1474609375, + "learning_rate": 9.559999999999998e-07, + "loss": 0.0059, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.625, + "epoch": 0.08813333333333333, + "grad_norm": 5.487739086151123, + "kl": 0.109375, + "learning_rate": 9.559333333333332e-07, + "loss": 0.0044, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1875, + "epoch": 0.08826666666666666, + "grad_norm": 8.709589958190918, + "kl": 0.140380859375, + "learning_rate": 9.558666666666666e-07, + "loss": 0.0056, + "reward": 1.1875, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.0884, + "grad_norm": 12.350615501403809, + "kl": 0.287109375, + "learning_rate": 9.558e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.08853333333333334, + "grad_norm": 15.66772747039795, + "kl": 0.1787109375, + "learning_rate": 9.557333333333332e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.08866666666666667, + "grad_norm": 44.34178161621094, + "kl": 0.20068359375, + "learning_rate": 9.556666666666666e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.9375, + "epoch": 0.0888, + "grad_norm": 5.940773963928223, + "kl": 0.19287109375, + "learning_rate": 9.556e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.08893333333333334, + "grad_norm": 6.962421417236328, + "kl": 0.12451171875, + "learning_rate": 9.555333333333334e-07, + "loss": 0.005, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8125, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 0.08906666666666667, + "grad_norm": 8.16552734375, + "kl": 0.17578125, + "learning_rate": 9.554666666666666e-07, + "loss": 0.007, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.375, + "epoch": 0.0892, + "grad_norm": 4.786959171295166, + "kl": 0.14453125, + "learning_rate": 9.554e-07, + "loss": 0.0058, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.08933333333333333, + "grad_norm": 7.1277241706848145, + "kl": 0.142578125, + "learning_rate": 9.553333333333334e-07, + "loss": 0.0057, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.08946666666666667, + "grad_norm": 11.057467460632324, + "kl": 0.18896484375, + "learning_rate": 9.552666666666666e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5625, + "epoch": 0.0896, + "grad_norm": 4.304596424102783, + "kl": 0.15087890625, + "learning_rate": 9.552e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.1875, + "epoch": 0.08973333333333333, + "grad_norm": 6.534228801727295, + "kl": 0.1796875, + "learning_rate": 9.551333333333332e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.4375, + "epoch": 0.08986666666666666, + "grad_norm": 5.636871814727783, + "kl": 0.125, + "learning_rate": 9.550666666666666e-07, + "loss": 0.005, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.625, + "epoch": 0.09, + "grad_norm": 10.25495433807373, + "kl": 0.1396484375, + "learning_rate": 9.55e-07, + "loss": 0.0056, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 0.09013333333333333, + "grad_norm": 5.620697975158691, + "kl": 0.205078125, + "learning_rate": 9.549333333333334e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.625, + "epoch": 0.09026666666666666, + "grad_norm": 8.27102279663086, + "kl": 0.096435546875, + "learning_rate": 9.548666666666665e-07, + "loss": 0.0039, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9375, + "epoch": 0.0904, + "grad_norm": 7.954347133636475, + "kl": 0.179443359375, + "learning_rate": 9.548e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.4375, + "epoch": 0.09053333333333333, + "grad_norm": 6.221362113952637, + "kl": 0.14404296875, + "learning_rate": 9.547333333333333e-07, + "loss": 0.0058, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6875, + "epoch": 0.09066666666666667, + "grad_norm": 11.248605728149414, + "kl": 0.21923828125, + "learning_rate": 9.546666666666665e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4375, + "epoch": 0.0908, + "grad_norm": 13.854522705078125, + "kl": 0.1552734375, + "learning_rate": 9.546e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.09093333333333334, + "grad_norm": 13.62419605255127, + "kl": 0.15771484375, + "learning_rate": 9.545333333333333e-07, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.1875, + "epoch": 0.09106666666666667, + "grad_norm": 4.429844856262207, + "kl": 0.1396484375, + "learning_rate": 9.544666666666667e-07, + "loss": 0.0056, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 0.0912, + "grad_norm": 10.376134872436523, + "kl": 0.14990234375, + "learning_rate": 9.544e-07, + "loss": 0.006, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.625, + "epoch": 0.09133333333333334, + "grad_norm": 0.37626299262046814, + "kl": 0.1513671875, + "learning_rate": 9.543333333333333e-07, + "loss": 0.0061, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.3125, + "epoch": 0.09146666666666667, + "grad_norm": 9.530667304992676, + "kl": 0.1494140625, + "learning_rate": 9.542666666666667e-07, + "loss": 0.006, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0, + "epoch": 0.0916, + "grad_norm": 10.406678199768066, + "kl": 0.2255859375, + "learning_rate": 9.542e-07, + "loss": 0.0091, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.375, + "epoch": 0.09173333333333333, + "grad_norm": 0.48532021045684814, + "kl": 0.151611328125, + "learning_rate": 9.541333333333333e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.875, + "epoch": 0.09186666666666667, + "grad_norm": 11.045458793640137, + "kl": 0.18701171875, + "learning_rate": 9.540666666666665e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0, + "epoch": 0.092, + "grad_norm": 9.525378227233887, + "kl": 0.201904296875, + "learning_rate": 9.539999999999999e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.1875, + "epoch": 0.09213333333333333, + "grad_norm": 8.436893463134766, + "kl": 0.1376953125, + "learning_rate": 9.539333333333333e-07, + "loss": 0.0055, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5, + "epoch": 0.09226666666666666, + "grad_norm": 6.660350322723389, + "kl": 0.14111328125, + "learning_rate": 9.538666666666667e-07, + "loss": 0.0056, + "reward": 1.25, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0, + "epoch": 0.0924, + "grad_norm": 16.111494064331055, + "kl": 0.13720703125, + "learning_rate": 9.537999999999999e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0625, + "epoch": 0.09253333333333333, + "grad_norm": 5.299132347106934, + "kl": 0.1591796875, + "learning_rate": 9.537333333333333e-07, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.125, + "epoch": 0.09266666666666666, + "grad_norm": 84.8128433227539, + "kl": 0.14453125, + "learning_rate": 9.536666666666667e-07, + "loss": 0.0058, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.3125, + "epoch": 0.0928, + "grad_norm": 11.261372566223145, + "kl": 0.12939453125, + "learning_rate": 9.536e-07, + "loss": 0.0052, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.875, + "epoch": 0.09293333333333334, + "grad_norm": 6.267222881317139, + "kl": 0.15185546875, + "learning_rate": 9.535333333333333e-07, + "loss": 0.0061, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.9375, + "epoch": 0.09306666666666667, + "grad_norm": 25.05236053466797, + "kl": 0.113525390625, + "learning_rate": 9.534666666666667e-07, + "loss": 0.0045, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.0932, + "grad_norm": 12.837738037109375, + "kl": 0.146484375, + "learning_rate": 9.534e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.8125, + "epoch": 0.09333333333333334, + "grad_norm": 9.199910163879395, + "kl": 0.1650390625, + "learning_rate": 9.533333333333333e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.9375, + "epoch": 0.09346666666666667, + "grad_norm": 16.601720809936523, + "kl": 0.16455078125, + "learning_rate": 9.532666666666666e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.375, + "epoch": 0.0936, + "grad_norm": 9.510900497436523, + "kl": 0.244140625, + "learning_rate": 9.532e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 0.09373333333333334, + "grad_norm": 40.891990661621094, + "kl": 1.25146484375, + "learning_rate": 9.531333333333333e-07, + "loss": 0.0502, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 0.09386666666666667, + "grad_norm": 4.823275089263916, + "kl": 0.12158203125, + "learning_rate": 9.530666666666666e-07, + "loss": 0.0049, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.75, + "epoch": 0.094, + "grad_norm": 6.978858470916748, + "kl": 0.18115234375, + "learning_rate": 9.529999999999999e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 0.09413333333333333, + "grad_norm": 7.044653415679932, + "kl": 0.148681640625, + "learning_rate": 9.529333333333332e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.125, + "epoch": 0.09426666666666667, + "grad_norm": 8.284242630004883, + "kl": 0.13427734375, + "learning_rate": 9.528666666666666e-07, + "loss": 0.0054, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.125, + "epoch": 0.0944, + "grad_norm": 5.185154914855957, + "kl": 0.25537109375, + "learning_rate": 9.527999999999999e-07, + "loss": 0.0102, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 0.09453333333333333, + "grad_norm": 7.697679042816162, + "kl": 0.17333984375, + "learning_rate": 9.527333333333333e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.625, + "epoch": 0.09466666666666666, + "grad_norm": 6.738422393798828, + "kl": 0.11572265625, + "learning_rate": 9.526666666666666e-07, + "loss": 0.0046, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 0.0948, + "grad_norm": 6.775939464569092, + "kl": 0.1328125, + "learning_rate": 9.526e-07, + "loss": 0.0053, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8125, + "epoch": 0.09493333333333333, + "grad_norm": 12.294835090637207, + "kl": 0.1953125, + "learning_rate": 9.525333333333333e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 0.09506666666666666, + "grad_norm": 10.881726264953613, + "kl": 0.109619140625, + "learning_rate": 9.524666666666667e-07, + "loss": 0.0044, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4375, + "epoch": 0.0952, + "grad_norm": 5.472227573394775, + "kl": 0.13818359375, + "learning_rate": 9.524e-07, + "loss": 0.0055, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5, + "epoch": 0.09533333333333334, + "grad_norm": 11.295037269592285, + "kl": 0.1689453125, + "learning_rate": 9.523333333333333e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.5625, + "epoch": 0.09546666666666667, + "grad_norm": 7.535458564758301, + "kl": 0.125, + "learning_rate": 9.522666666666667e-07, + "loss": 0.005, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0625, + "epoch": 0.0956, + "grad_norm": 10.187446594238281, + "kl": 0.22412109375, + "learning_rate": 9.522e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.4375, + "epoch": 0.09573333333333334, + "grad_norm": 4.439114570617676, + "kl": 0.166015625, + "learning_rate": 9.521333333333334e-07, + "loss": 0.0066, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.1875, + "epoch": 0.09586666666666667, + "grad_norm": 7.069751739501953, + "kl": 0.20068359375, + "learning_rate": 9.520666666666665e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.096, + "grad_norm": 5.992896556854248, + "kl": 0.29443359375, + "learning_rate": 9.52e-07, + "loss": 0.0117, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 0.09613333333333333, + "grad_norm": 7.542893886566162, + "kl": 0.12353515625, + "learning_rate": 9.519333333333332e-07, + "loss": 0.0049, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5625, + "epoch": 0.09626666666666667, + "grad_norm": 17.415485382080078, + "kl": 0.7177734375, + "learning_rate": 9.518666666666666e-07, + "loss": 0.0287, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.875, + "epoch": 0.0964, + "grad_norm": 10.445457458496094, + "kl": 0.1884765625, + "learning_rate": 9.517999999999999e-07, + "loss": 0.0075, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.4375, + "epoch": 0.09653333333333333, + "grad_norm": 10.756857872009277, + "kl": 0.205078125, + "learning_rate": 9.517333333333332e-07, + "loss": 0.0082, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.8125, + "epoch": 0.09666666666666666, + "grad_norm": 18.781190872192383, + "kl": 1.2646484375, + "learning_rate": 9.516666666666666e-07, + "loss": 0.0503, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8125, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.875, + "epoch": 0.0968, + "grad_norm": 4.615570068359375, + "kl": 0.1240234375, + "learning_rate": 9.515999999999999e-07, + "loss": 0.005, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.75, + "epoch": 0.09693333333333333, + "grad_norm": 5.842894554138184, + "kl": 0.2138671875, + "learning_rate": 9.515333333333333e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.25, + "epoch": 0.09706666666666666, + "grad_norm": 10.779583930969238, + "kl": 0.2138671875, + "learning_rate": 9.514666666666666e-07, + "loss": 0.0086, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.25, + "epoch": 0.0972, + "grad_norm": 10.744233131408691, + "kl": 0.127197265625, + "learning_rate": 9.514e-07, + "loss": 0.0051, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.75, + "epoch": 0.09733333333333333, + "grad_norm": 1.0759063959121704, + "kl": 0.232421875, + "learning_rate": 9.513333333333333e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.5, + "epoch": 0.09746666666666666, + "grad_norm": 12.912086486816406, + "kl": 0.28759765625, + "learning_rate": 9.512666666666667e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5, + "epoch": 0.0976, + "grad_norm": 10.578949928283691, + "kl": 0.2216796875, + "learning_rate": 9.512e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.4375, + "epoch": 0.09773333333333334, + "grad_norm": 7.231508731842041, + "kl": 0.16943359375, + "learning_rate": 9.511333333333334e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.1875, + "epoch": 0.09786666666666667, + "grad_norm": 21.439809799194336, + "kl": 0.21533203125, + "learning_rate": 9.510666666666666e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.875, + "epoch": 0.098, + "grad_norm": 11.920452117919922, + "kl": 0.17333984375, + "learning_rate": 9.509999999999999e-07, + "loss": 0.0069, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.4375, + "epoch": 0.09813333333333334, + "grad_norm": 18.38294792175293, + "kl": 0.287109375, + "learning_rate": 9.509333333333333e-07, + "loss": 0.0115, + "reward": 1.3125, + "reward_std": 0.7833450436592102, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.625, + "epoch": 0.09826666666666667, + "grad_norm": 10.96186351776123, + "kl": 0.23388671875, + "learning_rate": 9.508666666666666e-07, + "loss": 0.0094, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.8125, + "epoch": 0.0984, + "grad_norm": 8.900819778442383, + "kl": 0.2802734375, + "learning_rate": 9.508e-07, + "loss": 0.0112, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.4375, + "epoch": 0.09853333333333333, + "grad_norm": 9.328983306884766, + "kl": 0.1435546875, + "learning_rate": 9.507333333333333e-07, + "loss": 0.0057, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.75, + "epoch": 0.09866666666666667, + "grad_norm": 6.3391008377075195, + "kl": 0.15966796875, + "learning_rate": 9.506666666666667e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.4375, + "epoch": 0.0988, + "grad_norm": 7.16552209854126, + "kl": 0.22802734375, + "learning_rate": 9.506e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.5, + "epoch": 0.09893333333333333, + "grad_norm": 8.36678695678711, + "kl": 0.4619140625, + "learning_rate": 9.505333333333333e-07, + "loss": 0.0185, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.375, + "epoch": 0.09906666666666666, + "grad_norm": 65.99493408203125, + "kl": 0.22216796875, + "learning_rate": 9.504666666666666e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.9375, + "epoch": 0.0992, + "grad_norm": 7.484498500823975, + "kl": 0.2333984375, + "learning_rate": 9.503999999999999e-07, + "loss": 0.0094, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.625, + "epoch": 0.09933333333333333, + "grad_norm": 7.937389373779297, + "kl": 0.265625, + "learning_rate": 9.503333333333333e-07, + "loss": 0.0107, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.125, + "epoch": 0.09946666666666666, + "grad_norm": 7.607924938201904, + "kl": 0.23974609375, + "learning_rate": 9.502666666666666e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.5625, + "epoch": 0.0996, + "grad_norm": 19.695220947265625, + "kl": 0.4541015625, + "learning_rate": 9.502e-07, + "loss": 0.0182, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5, + "epoch": 0.09973333333333333, + "grad_norm": 40.30133819580078, + "kl": 0.77099609375, + "learning_rate": 9.501333333333333e-07, + "loss": 0.0309, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.9375, + "epoch": 0.09986666666666667, + "grad_norm": 6.312315940856934, + "kl": 0.24169921875, + "learning_rate": 9.500666666666666e-07, + "loss": 0.0097, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.5, + "epoch": 0.1, + "grad_norm": 11.48764419555664, + "kl": 0.4384765625, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0176, + "reward": 1.3125, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.75, + "epoch": 0.10013333333333334, + "grad_norm": 9.404945373535156, + "kl": 0.2880859375, + "learning_rate": 9.499333333333333e-07, + "loss": 0.0115, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.5, + "epoch": 0.10026666666666667, + "grad_norm": 6.094201564788818, + "kl": 0.31103515625, + "learning_rate": 9.498666666666666e-07, + "loss": 0.0125, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0625, + "epoch": 0.1004, + "grad_norm": 13.841280937194824, + "kl": 0.3330078125, + "learning_rate": 9.497999999999999e-07, + "loss": 0.0133, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.8125, + "epoch": 0.10053333333333334, + "grad_norm": 12.861478805541992, + "kl": 0.224609375, + "learning_rate": 9.497333333333333e-07, + "loss": 0.009, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.25, + "epoch": 0.10066666666666667, + "grad_norm": 1.2965315580368042, + "kl": 0.3486328125, + "learning_rate": 9.496666666666666e-07, + "loss": 0.014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.125, + "epoch": 0.1008, + "grad_norm": 8.398758888244629, + "kl": 0.17919921875, + "learning_rate": 9.496e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.75, + "epoch": 0.10093333333333333, + "grad_norm": 8.697480201721191, + "kl": 0.27490234375, + "learning_rate": 9.495333333333333e-07, + "loss": 0.011, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.125, + "epoch": 0.10106666666666667, + "grad_norm": 4.601747512817383, + "kl": 0.20751953125, + "learning_rate": 9.494666666666667e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.6875, + "epoch": 0.1012, + "grad_norm": 7.835274696350098, + "kl": 0.3095703125, + "learning_rate": 9.494e-07, + "loss": 0.0124, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0, + "epoch": 0.10133333333333333, + "grad_norm": 9.4596586227417, + "kl": 0.13818359375, + "learning_rate": 9.493333333333334e-07, + "loss": 0.0055, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.1875, + "epoch": 0.10146666666666666, + "grad_norm": 8.46574878692627, + "kl": 0.15478515625, + "learning_rate": 9.492666666666667e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.9375, + "epoch": 0.1016, + "grad_norm": 11.597868919372559, + "kl": 0.181640625, + "learning_rate": 9.492e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.625, + "epoch": 0.10173333333333333, + "grad_norm": 9.605172157287598, + "kl": 0.2607421875, + "learning_rate": 9.491333333333333e-07, + "loss": 0.0104, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.8125, + "epoch": 0.10186666666666666, + "grad_norm": 11.157912254333496, + "kl": 0.1728515625, + "learning_rate": 9.490666666666665e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.75, + "epoch": 0.102, + "grad_norm": 9.27524185180664, + "kl": 0.21728515625, + "learning_rate": 9.489999999999999e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5625, + "epoch": 0.10213333333333334, + "grad_norm": 9.377603530883789, + "kl": 0.1357421875, + "learning_rate": 9.489333333333332e-07, + "loss": 0.0054, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.6875, + "epoch": 0.10226666666666667, + "grad_norm": 8.881782531738281, + "kl": 0.1669921875, + "learning_rate": 9.488666666666666e-07, + "loss": 0.0067, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.25, + "epoch": 0.1024, + "grad_norm": 41.580875396728516, + "kl": 0.2177734375, + "learning_rate": 9.487999999999999e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.375, + "epoch": 0.10253333333333334, + "grad_norm": 6.549374103546143, + "kl": 0.19921875, + "learning_rate": 9.487333333333333e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5, + "epoch": 0.10266666666666667, + "grad_norm": 23.751462936401367, + "kl": 0.1845703125, + "learning_rate": 9.486666666666666e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8125, + "epoch": 0.1028, + "grad_norm": 6.939698696136475, + "kl": 0.1484375, + "learning_rate": 9.485999999999999e-07, + "loss": 0.0059, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.10293333333333334, + "grad_norm": 10.688098907470703, + "kl": 0.18310546875, + "learning_rate": 9.485333333333333e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 0.10306666666666667, + "grad_norm": 7.326903820037842, + "kl": 0.12255859375, + "learning_rate": 9.484666666666666e-07, + "loss": 0.0049, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.1032, + "grad_norm": 11.75161075592041, + "kl": 0.154541015625, + "learning_rate": 9.484e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.10333333333333333, + "grad_norm": 54.46729278564453, + "kl": 0.131103515625, + "learning_rate": 9.483333333333333e-07, + "loss": 0.0052, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.9375, + "epoch": 0.10346666666666667, + "grad_norm": 16.988195419311523, + "kl": 0.196533203125, + "learning_rate": 9.482666666666667e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.25, + "epoch": 0.1036, + "grad_norm": 12.067306518554688, + "kl": 0.1904296875, + "learning_rate": 9.482e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.10373333333333333, + "grad_norm": 10.080941200256348, + "kl": 0.1767578125, + "learning_rate": 9.481333333333334e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.1875, + "epoch": 0.10386666666666666, + "grad_norm": 15.384099960327148, + "kl": 0.163818359375, + "learning_rate": 9.480666666666666e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.1875, + "epoch": 0.104, + "grad_norm": 13.50665283203125, + "kl": 0.18115234375, + "learning_rate": 9.479999999999999e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.0, + "epoch": 0.10413333333333333, + "grad_norm": 13.772222518920898, + "kl": 0.25244140625, + "learning_rate": 9.479333333333333e-07, + "loss": 0.0101, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5625, + "epoch": 0.10426666666666666, + "grad_norm": 16.62538719177246, + "kl": 0.2255859375, + "learning_rate": 9.478666666666666e-07, + "loss": 0.009, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.8125, + "epoch": 0.1044, + "grad_norm": 12.947239875793457, + "kl": 0.20458984375, + "learning_rate": 9.478e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.1875, + "epoch": 0.10453333333333334, + "grad_norm": 11.330567359924316, + "kl": 0.18310546875, + "learning_rate": 9.477333333333332e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 0.10466666666666667, + "grad_norm": 7.875192165374756, + "kl": 0.24951171875, + "learning_rate": 9.476666666666666e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.1875, + "epoch": 0.1048, + "grad_norm": 7.649266719818115, + "kl": 0.139404296875, + "learning_rate": 9.475999999999999e-07, + "loss": 0.0056, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.0625, + "epoch": 0.10493333333333334, + "grad_norm": 11.982501983642578, + "kl": 0.19580078125, + "learning_rate": 9.475333333333333e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.10506666666666667, + "grad_norm": 8.730623245239258, + "kl": 0.126953125, + "learning_rate": 9.474666666666666e-07, + "loss": 0.0051, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.0, + "epoch": 0.1052, + "grad_norm": 16.07733154296875, + "kl": 0.4931640625, + "learning_rate": 9.474e-07, + "loss": 0.0198, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.0, + "epoch": 0.10533333333333333, + "grad_norm": 10.319522857666016, + "kl": 0.2412109375, + "learning_rate": 9.473333333333333e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.6875, + "epoch": 0.10546666666666667, + "grad_norm": 9.709027290344238, + "kl": 0.224609375, + "learning_rate": 9.472666666666666e-07, + "loss": 0.009, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.1056, + "grad_norm": 8.560288429260254, + "kl": 0.16943359375, + "learning_rate": 9.472e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.375, + "epoch": 0.10573333333333333, + "grad_norm": 6.623203754425049, + "kl": 0.220703125, + "learning_rate": 9.471333333333333e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0, + "epoch": 0.10586666666666666, + "grad_norm": 9.530043601989746, + "kl": 0.13720703125, + "learning_rate": 9.470666666666666e-07, + "loss": 0.0055, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.75, + "epoch": 0.106, + "grad_norm": 8.884490966796875, + "kl": 0.1591796875, + "learning_rate": 9.469999999999999e-07, + "loss": 0.0064, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5625, + "epoch": 0.10613333333333333, + "grad_norm": 7.858128070831299, + "kl": 0.1337890625, + "learning_rate": 9.469333333333333e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.875, + "epoch": 0.10626666666666666, + "grad_norm": 8.238045692443848, + "kl": 0.143798828125, + "learning_rate": 9.468666666666666e-07, + "loss": 0.0058, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.3125, + "epoch": 0.1064, + "grad_norm": 13.09151840209961, + "kl": 0.1630859375, + "learning_rate": 9.468e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.10653333333333333, + "grad_norm": 9.284591674804688, + "kl": 0.17919921875, + "learning_rate": 9.467333333333333e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.10666666666666667, + "grad_norm": 57.528282165527344, + "kl": 0.2529296875, + "learning_rate": 9.466666666666666e-07, + "loss": 0.0101, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.25, + "epoch": 0.1068, + "grad_norm": 6.88197135925293, + "kl": 0.2041015625, + "learning_rate": 9.466e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.10693333333333334, + "grad_norm": 9.977322578430176, + "kl": 0.30615234375, + "learning_rate": 9.465333333333333e-07, + "loss": 0.0122, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.8125, + "epoch": 0.10706666666666667, + "grad_norm": 8.792282104492188, + "kl": 0.1982421875, + "learning_rate": 9.464666666666667e-07, + "loss": 0.0079, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.3125, + "epoch": 0.1072, + "grad_norm": 11.42349624633789, + "kl": 0.16259765625, + "learning_rate": 9.464e-07, + "loss": 0.0065, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0, + "epoch": 0.10733333333333334, + "grad_norm": 10.709067344665527, + "kl": 0.171875, + "learning_rate": 9.463333333333334e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.10746666666666667, + "grad_norm": 9.49846363067627, + "kl": 0.17041015625, + "learning_rate": 9.462666666666666e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.75, + "epoch": 0.1076, + "grad_norm": 11.733687400817871, + "kl": 0.1474609375, + "learning_rate": 9.462e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.10773333333333333, + "grad_norm": 10.714509963989258, + "kl": 0.2060546875, + "learning_rate": 9.461333333333333e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0, + "epoch": 0.10786666666666667, + "grad_norm": 7.741424083709717, + "kl": 0.1689453125, + "learning_rate": 9.460666666666665e-07, + "loss": 0.0067, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.108, + "grad_norm": 10.103740692138672, + "kl": 0.172607421875, + "learning_rate": 9.459999999999999e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.9375, + "epoch": 0.10813333333333333, + "grad_norm": 5.315855026245117, + "kl": 0.20263671875, + "learning_rate": 9.459333333333332e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.10826666666666666, + "grad_norm": 8.804181098937988, + "kl": 0.18017578125, + "learning_rate": 9.458666666666666e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.25, + "epoch": 0.1084, + "grad_norm": 5.734143257141113, + "kl": 0.22607421875, + "learning_rate": 9.457999999999999e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.0, + "epoch": 0.10853333333333333, + "grad_norm": 9.308348655700684, + "kl": 0.193359375, + "learning_rate": 9.457333333333333e-07, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0, + "epoch": 0.10866666666666666, + "grad_norm": 7.976053714752197, + "kl": 0.2822265625, + "learning_rate": 9.456666666666666e-07, + "loss": 0.0113, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.875, + "epoch": 0.1088, + "grad_norm": 8.893139839172363, + "kl": 0.1767578125, + "learning_rate": 9.456e-07, + "loss": 0.0071, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.10893333333333333, + "grad_norm": 11.062081336975098, + "kl": 0.18212890625, + "learning_rate": 9.455333333333333e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.125, + "epoch": 0.10906666666666667, + "grad_norm": 8.939202308654785, + "kl": 0.17578125, + "learning_rate": 9.454666666666666e-07, + "loss": 0.007, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.625, + "epoch": 0.1092, + "grad_norm": 8.345919609069824, + "kl": 0.14111328125, + "learning_rate": 9.454e-07, + "loss": 0.0056, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0625, + "epoch": 0.10933333333333334, + "grad_norm": 6.359933376312256, + "kl": 0.1396484375, + "learning_rate": 9.453333333333333e-07, + "loss": 0.0056, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 0.10946666666666667, + "grad_norm": 8.155006408691406, + "kl": 0.14501953125, + "learning_rate": 9.452666666666667e-07, + "loss": 0.0058, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.875, + "epoch": 0.1096, + "grad_norm": 5.772094249725342, + "kl": 0.134521484375, + "learning_rate": 9.452e-07, + "loss": 0.0054, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.3125, + "epoch": 0.10973333333333334, + "grad_norm": 9.425529479980469, + "kl": 0.1376953125, + "learning_rate": 9.451333333333334e-07, + "loss": 0.0055, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.10986666666666667, + "grad_norm": 9.711377143859863, + "kl": 0.13916015625, + "learning_rate": 9.450666666666667e-07, + "loss": 0.0056, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.1875, + "epoch": 0.11, + "grad_norm": 0.5598104000091553, + "kl": 0.185546875, + "learning_rate": 9.45e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.875, + "epoch": 0.11013333333333333, + "grad_norm": 7.433315753936768, + "kl": 0.134033203125, + "learning_rate": 9.449333333333332e-07, + "loss": 0.0054, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.11026666666666667, + "grad_norm": 9.606620788574219, + "kl": 0.14111328125, + "learning_rate": 9.448666666666665e-07, + "loss": 0.0056, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 0.1104, + "grad_norm": 9.493745803833008, + "kl": 0.134033203125, + "learning_rate": 9.447999999999999e-07, + "loss": 0.0053, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 0.11053333333333333, + "grad_norm": 0.5072369575500488, + "kl": 0.17919921875, + "learning_rate": 9.447333333333332e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.875, + "epoch": 0.11066666666666666, + "grad_norm": 7.673058032989502, + "kl": 0.13037109375, + "learning_rate": 9.446666666666666e-07, + "loss": 0.0052, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5625, + "epoch": 0.1108, + "grad_norm": 0.3333415687084198, + "kl": 0.177734375, + "learning_rate": 9.445999999999999e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0, + "epoch": 0.11093333333333333, + "grad_norm": 4.61536169052124, + "kl": 0.225341796875, + "learning_rate": 9.445333333333333e-07, + "loss": 0.009, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 0.11106666666666666, + "grad_norm": 7.553878307342529, + "kl": 0.1162109375, + "learning_rate": 9.444666666666666e-07, + "loss": 0.0047, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 0.1112, + "grad_norm": 6.909963130950928, + "kl": 0.14501953125, + "learning_rate": 9.444e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 0.11133333333333334, + "grad_norm": 9.825883865356445, + "kl": 0.14599609375, + "learning_rate": 9.443333333333333e-07, + "loss": 0.0058, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.6875, + "epoch": 0.11146666666666667, + "grad_norm": 6.746296405792236, + "kl": 0.21142578125, + "learning_rate": 9.442666666666667e-07, + "loss": 0.0085, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 0.1116, + "grad_norm": 7.428668975830078, + "kl": 0.12353515625, + "learning_rate": 9.442e-07, + "loss": 0.0049, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.375, + "epoch": 0.11173333333333334, + "grad_norm": 8.729207992553711, + "kl": 0.13232421875, + "learning_rate": 9.441333333333333e-07, + "loss": 0.0053, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.11186666666666667, + "grad_norm": 7.660860061645508, + "kl": 0.13330078125, + "learning_rate": 9.440666666666667e-07, + "loss": 0.0053, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.112, + "grad_norm": 26.727035522460938, + "kl": 0.14208984375, + "learning_rate": 9.439999999999999e-07, + "loss": 0.0057, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5625, + "epoch": 0.11213333333333333, + "grad_norm": 8.328272819519043, + "kl": 0.1513671875, + "learning_rate": 9.439333333333333e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.875, + "epoch": 0.11226666666666667, + "grad_norm": 4.3440351486206055, + "kl": 0.141845703125, + "learning_rate": 9.438666666666666e-07, + "loss": 0.0057, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.875, + "epoch": 0.1124, + "grad_norm": 13.151805877685547, + "kl": 0.19580078125, + "learning_rate": 9.438e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.11253333333333333, + "grad_norm": 9.353602409362793, + "kl": 0.14697265625, + "learning_rate": 9.437333333333333e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0, + "epoch": 0.11266666666666666, + "grad_norm": 12.451237678527832, + "kl": 0.13330078125, + "learning_rate": 9.436666666666667e-07, + "loss": 0.0053, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5, + "epoch": 0.1128, + "grad_norm": 7.204890727996826, + "kl": 0.1708984375, + "learning_rate": 9.436e-07, + "loss": 0.0068, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.9375, + "epoch": 0.11293333333333333, + "grad_norm": 6.821323394775391, + "kl": 0.11767578125, + "learning_rate": 9.435333333333332e-07, + "loss": 0.0047, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.8125, + "epoch": 0.11306666666666666, + "grad_norm": 6.884734630584717, + "kl": 0.120361328125, + "learning_rate": 9.434666666666666e-07, + "loss": 0.0048, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.1875, + "epoch": 0.1132, + "grad_norm": 5.271696090698242, + "kl": 0.2177734375, + "learning_rate": 9.433999999999999e-07, + "loss": 0.0087, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1875, + "epoch": 0.11333333333333333, + "grad_norm": 8.42371654510498, + "kl": 0.11083984375, + "learning_rate": 9.433333333333333e-07, + "loss": 0.0044, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.625, + "epoch": 0.11346666666666666, + "grad_norm": 8.772198677062988, + "kl": 0.107421875, + "learning_rate": 9.432666666666666e-07, + "loss": 0.0043, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.1136, + "grad_norm": 16.181671142578125, + "kl": 0.14208984375, + "learning_rate": 9.432e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5, + "epoch": 0.11373333333333334, + "grad_norm": 7.915638446807861, + "kl": 0.23095703125, + "learning_rate": 9.431333333333333e-07, + "loss": 0.0092, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.125, + "epoch": 0.11386666666666667, + "grad_norm": 5.715921878814697, + "kl": 0.13818359375, + "learning_rate": 9.430666666666667e-07, + "loss": 0.0055, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.114, + "grad_norm": 7.063955307006836, + "kl": 0.19384765625, + "learning_rate": 9.429999999999999e-07, + "loss": 0.0077, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4375, + "epoch": 0.11413333333333334, + "grad_norm": 21.164045333862305, + "kl": 0.16015625, + "learning_rate": 9.429333333333332e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.4375, + "epoch": 0.11426666666666667, + "grad_norm": 3.9605977535247803, + "kl": 0.162841796875, + "learning_rate": 9.428666666666666e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5625, + "epoch": 0.1144, + "grad_norm": 7.817022323608398, + "kl": 0.119873046875, + "learning_rate": 9.427999999999999e-07, + "loss": 0.0048, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.375, + "epoch": 0.11453333333333333, + "grad_norm": 9.15217399597168, + "kl": 0.14013671875, + "learning_rate": 9.427333333333333e-07, + "loss": 0.0056, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8125, + "epoch": 0.11466666666666667, + "grad_norm": 9.73939323425293, + "kl": 0.15576171875, + "learning_rate": 9.426666666666666e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 0.1148, + "grad_norm": 15.919635772705078, + "kl": 0.23828125, + "learning_rate": 9.426e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.8125, + "epoch": 0.11493333333333333, + "grad_norm": 6.085958003997803, + "kl": 0.21337890625, + "learning_rate": 9.425333333333333e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.875, + "epoch": 0.11506666666666666, + "grad_norm": 9.476007461547852, + "kl": 0.33740234375, + "learning_rate": 9.424666666666667e-07, + "loss": 0.0135, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.1152, + "grad_norm": 5.306690692901611, + "kl": 0.21630859375, + "learning_rate": 9.424e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.11533333333333333, + "grad_norm": 7.605253219604492, + "kl": 0.136474609375, + "learning_rate": 9.423333333333333e-07, + "loss": 0.0054, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.11546666666666666, + "grad_norm": 7.204874515533447, + "kl": 0.2158203125, + "learning_rate": 9.422666666666667e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0625, + "epoch": 0.1156, + "grad_norm": 17.684133529663086, + "kl": 0.135009765625, + "learning_rate": 9.422e-07, + "loss": 0.0054, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.11573333333333333, + "grad_norm": 0.6236329674720764, + "kl": 0.18896484375, + "learning_rate": 9.421333333333334e-07, + "loss": 0.0076, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.75, + "epoch": 0.11586666666666667, + "grad_norm": 0.37047383189201355, + "kl": 0.193359375, + "learning_rate": 9.420666666666666e-07, + "loss": 0.0077, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.1875, + "epoch": 0.116, + "grad_norm": 11.309540748596191, + "kl": 0.171875, + "learning_rate": 9.419999999999999e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.375, + "epoch": 0.11613333333333334, + "grad_norm": 9.374924659729004, + "kl": 0.2314453125, + "learning_rate": 9.419333333333332e-07, + "loss": 0.0092, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.11626666666666667, + "grad_norm": 6.563712120056152, + "kl": 0.1533203125, + "learning_rate": 9.418666666666666e-07, + "loss": 0.0061, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0625, + "epoch": 0.1164, + "grad_norm": 4.017935752868652, + "kl": 0.17138671875, + "learning_rate": 9.417999999999999e-07, + "loss": 0.0069, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.875, + "epoch": 0.11653333333333334, + "grad_norm": 8.004398345947266, + "kl": 0.1611328125, + "learning_rate": 9.417333333333332e-07, + "loss": 0.0064, + "reward": 1.4375, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.875, + "epoch": 0.11666666666666667, + "grad_norm": 8.349335670471191, + "kl": 0.1640625, + "learning_rate": 9.416666666666666e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5625, + "epoch": 0.1168, + "grad_norm": 8.078995704650879, + "kl": 0.13671875, + "learning_rate": 9.415999999999999e-07, + "loss": 0.0055, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.11693333333333333, + "grad_norm": 206.5321044921875, + "kl": 0.1357421875, + "learning_rate": 9.415333333333333e-07, + "loss": 0.0054, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.1875, + "epoch": 0.11706666666666667, + "grad_norm": 1.7370468378067017, + "kl": 0.2119140625, + "learning_rate": 9.414666666666666e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 0.1172, + "grad_norm": 10.97011661529541, + "kl": 0.13818359375, + "learning_rate": 9.414e-07, + "loss": 0.0055, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 0.11733333333333333, + "grad_norm": 46.62332534790039, + "kl": 0.154296875, + "learning_rate": 9.413333333333333e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5625, + "epoch": 0.11746666666666666, + "grad_norm": 11.559188842773438, + "kl": 0.1552734375, + "learning_rate": 9.412666666666667e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.625, + "epoch": 0.1176, + "grad_norm": 6.779002666473389, + "kl": 0.15869140625, + "learning_rate": 9.412e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9375, + "epoch": 0.11773333333333333, + "grad_norm": 17.55160903930664, + "kl": 0.14990234375, + "learning_rate": 9.411333333333334e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.3125, + "epoch": 0.11786666666666666, + "grad_norm": 4.433633327484131, + "kl": 0.14990234375, + "learning_rate": 9.410666666666667e-07, + "loss": 0.006, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.118, + "grad_norm": 5.093143939971924, + "kl": 0.175537109375, + "learning_rate": 9.409999999999999e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9375, + "epoch": 0.11813333333333334, + "grad_norm": 3.25696063041687, + "kl": 0.146728515625, + "learning_rate": 9.409333333333333e-07, + "loss": 0.0059, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6875, + "epoch": 0.11826666666666667, + "grad_norm": 5.310414791107178, + "kl": 0.15185546875, + "learning_rate": 9.408666666666666e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.125, + "epoch": 0.1184, + "grad_norm": 5.381213188171387, + "kl": 0.14599609375, + "learning_rate": 9.408e-07, + "loss": 0.0059, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.6875, + "epoch": 0.11853333333333334, + "grad_norm": 9.651161193847656, + "kl": 0.24560546875, + "learning_rate": 9.407333333333332e-07, + "loss": 0.0098, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.8125, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9375, + "epoch": 0.11866666666666667, + "grad_norm": 6.1203813552856445, + "kl": 0.13525390625, + "learning_rate": 9.406666666666666e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.875, + "epoch": 0.1188, + "grad_norm": 5.523767948150635, + "kl": 0.09716796875, + "learning_rate": 9.405999999999999e-07, + "loss": 0.0039, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.3125, + "epoch": 0.11893333333333334, + "grad_norm": 2.9161863327026367, + "kl": 0.23046875, + "learning_rate": 9.405333333333333e-07, + "loss": 0.0092, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 0.11906666666666667, + "grad_norm": 8.986708641052246, + "kl": 0.14794921875, + "learning_rate": 9.404666666666666e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.75, + "epoch": 0.1192, + "grad_norm": 9.579798698425293, + "kl": 0.19580078125, + "learning_rate": 9.403999999999999e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.5625, + "epoch": 0.11933333333333333, + "grad_norm": 10.377392768859863, + "kl": 0.184814453125, + "learning_rate": 9.403333333333333e-07, + "loss": 0.0074, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5625, + "epoch": 0.11946666666666667, + "grad_norm": 5.869200229644775, + "kl": 0.28076171875, + "learning_rate": 9.402666666666666e-07, + "loss": 0.0112, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.1196, + "grad_norm": 4.527641296386719, + "kl": 0.17041015625, + "learning_rate": 9.402e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.11973333333333333, + "grad_norm": 10.41123104095459, + "kl": 0.29296875, + "learning_rate": 9.401333333333333e-07, + "loss": 0.0117, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.0625, + "epoch": 0.11986666666666666, + "grad_norm": 8.323046684265137, + "kl": 0.212890625, + "learning_rate": 9.400666666666667e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0625, + "epoch": 0.12, + "grad_norm": 6.1327338218688965, + "kl": 0.19189453125, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0077, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.4375, + "epoch": 0.12013333333333333, + "grad_norm": 8.622079849243164, + "kl": 0.1376953125, + "learning_rate": 9.399333333333333e-07, + "loss": 0.0055, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.4375, + "epoch": 0.12026666666666666, + "grad_norm": 54.06244659423828, + "kl": 0.13037109375, + "learning_rate": 9.398666666666666e-07, + "loss": 0.0052, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0, + "epoch": 0.1204, + "grad_norm": 7.571897506713867, + "kl": 0.1884765625, + "learning_rate": 9.397999999999999e-07, + "loss": 0.0075, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.3125, + "epoch": 0.12053333333333334, + "grad_norm": 5.983295440673828, + "kl": 0.205078125, + "learning_rate": 9.397333333333333e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5625, + "epoch": 0.12066666666666667, + "grad_norm": 6.633647441864014, + "kl": 0.23779296875, + "learning_rate": 9.396666666666666e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.1208, + "grad_norm": 5.945021629333496, + "kl": 0.154296875, + "learning_rate": 9.396e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.12093333333333334, + "grad_norm": 5.363271713256836, + "kl": 0.1943359375, + "learning_rate": 9.395333333333333e-07, + "loss": 0.0078, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.375, + "epoch": 0.12106666666666667, + "grad_norm": 7.566720008850098, + "kl": 0.21337890625, + "learning_rate": 9.394666666666667e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.1212, + "grad_norm": 1.385768175125122, + "kl": 0.2822265625, + "learning_rate": 9.394e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.12133333333333333, + "grad_norm": 3.9306204319000244, + "kl": 0.15869140625, + "learning_rate": 9.393333333333334e-07, + "loss": 0.0063, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.5, + "epoch": 0.12146666666666667, + "grad_norm": 0.42105832695961, + "kl": 0.1337890625, + "learning_rate": 9.392666666666666e-07, + "loss": 0.0054, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 0.1216, + "grad_norm": 9.037447929382324, + "kl": 0.2939453125, + "learning_rate": 9.391999999999999e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.625, + "epoch": 0.12173333333333333, + "grad_norm": 7.711023807525635, + "kl": 0.194091796875, + "learning_rate": 9.391333333333333e-07, + "loss": 0.0078, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.75, + "epoch": 0.12186666666666666, + "grad_norm": 5.715243339538574, + "kl": 0.131591796875, + "learning_rate": 9.390666666666666e-07, + "loss": 0.0053, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3125, + "epoch": 0.122, + "grad_norm": 0.6116865873336792, + "kl": 0.2109375, + "learning_rate": 9.389999999999999e-07, + "loss": 0.0084, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.4375, + "epoch": 0.12213333333333333, + "grad_norm": 6.610238552093506, + "kl": 0.18603515625, + "learning_rate": 9.389333333333332e-07, + "loss": 0.0074, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.875, + "epoch": 0.12226666666666666, + "grad_norm": 5.405671119689941, + "kl": 0.185546875, + "learning_rate": 9.388666666666666e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.1224, + "grad_norm": 9.788783073425293, + "kl": 0.17822265625, + "learning_rate": 9.387999999999999e-07, + "loss": 0.0071, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.0625, + "epoch": 0.12253333333333333, + "grad_norm": 9.611906051635742, + "kl": 0.1533203125, + "learning_rate": 9.387333333333333e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.3125, + "epoch": 0.12266666666666666, + "grad_norm": 5.598555088043213, + "kl": 0.13330078125, + "learning_rate": 9.386666666666666e-07, + "loss": 0.0053, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0625, + "epoch": 0.1228, + "grad_norm": 10.655545234680176, + "kl": 0.16748046875, + "learning_rate": 9.385999999999999e-07, + "loss": 0.0067, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9375, + "epoch": 0.12293333333333334, + "grad_norm": 6.27906608581543, + "kl": 0.18994140625, + "learning_rate": 9.385333333333333e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.4375, + "epoch": 0.12306666666666667, + "grad_norm": 6.643558979034424, + "kl": 0.17626953125, + "learning_rate": 9.384666666666666e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.4375, + "epoch": 0.1232, + "grad_norm": 7.737495422363281, + "kl": 0.22119140625, + "learning_rate": 9.384e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.12333333333333334, + "grad_norm": 7.073555946350098, + "kl": 0.228515625, + "learning_rate": 9.383333333333333e-07, + "loss": 0.0092, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.75, + "epoch": 0.12346666666666667, + "grad_norm": 5.748117446899414, + "kl": 0.19775390625, + "learning_rate": 9.382666666666667e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.9375, + "epoch": 0.1236, + "grad_norm": 5.019461154937744, + "kl": 0.21435546875, + "learning_rate": 9.382e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.375, + "epoch": 0.12373333333333333, + "grad_norm": 6.52340841293335, + "kl": 0.1396484375, + "learning_rate": 9.381333333333334e-07, + "loss": 0.0056, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.6875, + "epoch": 0.12386666666666667, + "grad_norm": 10.42676830291748, + "kl": 0.1669921875, + "learning_rate": 9.380666666666667e-07, + "loss": 0.0067, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.124, + "grad_norm": 9.147452354431152, + "kl": 0.236328125, + "learning_rate": 9.379999999999998e-07, + "loss": 0.0094, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.75, + "epoch": 0.12413333333333333, + "grad_norm": 4.155561447143555, + "kl": 0.15771484375, + "learning_rate": 9.379333333333332e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.12426666666666666, + "grad_norm": 5.087516784667969, + "kl": 0.15234375, + "learning_rate": 9.378666666666665e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3125, + "epoch": 0.1244, + "grad_norm": 4.739368438720703, + "kl": 0.15625, + "learning_rate": 9.377999999999999e-07, + "loss": 0.0062, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5, + "epoch": 0.12453333333333333, + "grad_norm": 0.6930071115493774, + "kl": 0.203125, + "learning_rate": 9.377333333333332e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.4375, + "epoch": 0.12466666666666666, + "grad_norm": 7.80615758895874, + "kl": 0.28369140625, + "learning_rate": 9.376666666666666e-07, + "loss": 0.0114, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.75, + "epoch": 0.1248, + "grad_norm": 8.316946983337402, + "kl": 0.14794921875, + "learning_rate": 9.375999999999999e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.6875, + "epoch": 0.12493333333333333, + "grad_norm": 7.817391872406006, + "kl": 0.121826171875, + "learning_rate": 9.375333333333333e-07, + "loss": 0.0049, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.125, + "epoch": 0.12506666666666666, + "grad_norm": 7.152714729309082, + "kl": 0.3046875, + "learning_rate": 9.374666666666666e-07, + "loss": 0.0122, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.1252, + "grad_norm": 77.39132690429688, + "kl": 0.16796875, + "learning_rate": 9.374e-07, + "loss": 0.0067, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.375, + "epoch": 0.12533333333333332, + "grad_norm": 0.5108355283737183, + "kl": 0.23876953125, + "learning_rate": 9.373333333333333e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.125, + "epoch": 0.12546666666666667, + "grad_norm": 6.135501384735107, + "kl": 0.1494140625, + "learning_rate": 9.372666666666666e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.125, + "epoch": 0.1256, + "grad_norm": 7.230851650238037, + "kl": 0.2890625, + "learning_rate": 9.372e-07, + "loss": 0.0116, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.25, + "epoch": 0.12573333333333334, + "grad_norm": 9.583233833312988, + "kl": 0.24755859375, + "learning_rate": 9.371333333333333e-07, + "loss": 0.0099, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.75, + "epoch": 0.12586666666666665, + "grad_norm": 5.432095527648926, + "kl": 0.15478515625, + "learning_rate": 9.370666666666667e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.8125, + "epoch": 0.126, + "grad_norm": 6.953840732574463, + "kl": 0.3447265625, + "learning_rate": 9.37e-07, + "loss": 0.0138, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.1875, + "epoch": 0.12613333333333332, + "grad_norm": 5.145753383636475, + "kl": 0.23974609375, + "learning_rate": 9.369333333333333e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0, + "epoch": 0.12626666666666667, + "grad_norm": 72.58935546875, + "kl": 0.291015625, + "learning_rate": 9.368666666666666e-07, + "loss": 0.0117, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.8125, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.1264, + "grad_norm": 14.232046127319336, + "kl": 0.16162109375, + "learning_rate": 9.368e-07, + "loss": 0.0065, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.5625, + "epoch": 0.12653333333333333, + "grad_norm": 57.12860107421875, + "kl": 2.265625, + "learning_rate": 9.367333333333333e-07, + "loss": 0.0909, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 0.12666666666666668, + "grad_norm": 8.947127342224121, + "kl": 0.14111328125, + "learning_rate": 9.366666666666666e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.1875, + "epoch": 0.1268, + "grad_norm": 15.684880256652832, + "kl": 0.1884765625, + "learning_rate": 9.366e-07, + "loss": 0.0076, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.12693333333333334, + "grad_norm": 7.485630989074707, + "kl": 0.15673828125, + "learning_rate": 9.365333333333332e-07, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.125, + "epoch": 0.12706666666666666, + "grad_norm": 12.311019897460938, + "kl": 0.5029296875, + "learning_rate": 9.364666666666666e-07, + "loss": 0.0201, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.3125, + "epoch": 0.1272, + "grad_norm": 13.734447479248047, + "kl": 0.2900390625, + "learning_rate": 9.363999999999999e-07, + "loss": 0.0116, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5, + "epoch": 0.12733333333333333, + "grad_norm": 10.22912883758545, + "kl": 0.22802734375, + "learning_rate": 9.363333333333333e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.4375, + "epoch": 0.12746666666666667, + "grad_norm": 10.038904190063477, + "kl": 0.2265625, + "learning_rate": 9.362666666666666e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.9375, + "epoch": 0.1276, + "grad_norm": 3.2229957580566406, + "kl": 0.427734375, + "learning_rate": 9.362e-07, + "loss": 0.0171, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.25, + "epoch": 0.12773333333333334, + "grad_norm": 12.554479598999023, + "kl": 0.26220703125, + "learning_rate": 9.361333333333333e-07, + "loss": 0.0105, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.1875, + "epoch": 0.12786666666666666, + "grad_norm": 15.201395034790039, + "kl": 0.408203125, + "learning_rate": 9.360666666666667e-07, + "loss": 0.0163, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.8125, + "epoch": 0.128, + "grad_norm": 48.1810188293457, + "kl": 0.33642578125, + "learning_rate": 9.36e-07, + "loss": 0.0134, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.125, + "epoch": 0.12813333333333332, + "grad_norm": 9.830523490905762, + "kl": 0.26123046875, + "learning_rate": 9.359333333333332e-07, + "loss": 0.0104, + "reward": 1.5625, + "reward_std": 0.7216846346855164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8125, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.12826666666666667, + "grad_norm": 8.260987281799316, + "kl": 0.17919921875, + "learning_rate": 9.358666666666666e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.3125, + "epoch": 0.1284, + "grad_norm": 9.897655487060547, + "kl": 0.40087890625, + "learning_rate": 9.357999999999999e-07, + "loss": 0.016, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.75, + "epoch": 0.12853333333333333, + "grad_norm": 1.023735761642456, + "kl": 0.3115234375, + "learning_rate": 9.357333333333333e-07, + "loss": 0.0125, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.5, + "epoch": 0.12866666666666668, + "grad_norm": 6.917901515960693, + "kl": 0.302734375, + "learning_rate": 9.356666666666666e-07, + "loss": 0.0121, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.75, + "epoch": 0.1288, + "grad_norm": 13.287903785705566, + "kl": 0.41015625, + "learning_rate": 9.356e-07, + "loss": 0.0164, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1875, + "epoch": 0.12893333333333334, + "grad_norm": 11.528717994689941, + "kl": 0.3779296875, + "learning_rate": 9.355333333333333e-07, + "loss": 0.0151, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0625, + "epoch": 0.12906666666666666, + "grad_norm": 3.8943734169006348, + "kl": 0.4287109375, + "learning_rate": 9.354666666666667e-07, + "loss": 0.0171, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.25, + "epoch": 0.1292, + "grad_norm": 7.07387638092041, + "kl": 0.404296875, + "learning_rate": 9.354e-07, + "loss": 0.0161, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.1875, + "epoch": 0.12933333333333333, + "grad_norm": 12.562474250793457, + "kl": 0.22900390625, + "learning_rate": 9.353333333333333e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.5625, + "epoch": 0.12946666666666667, + "grad_norm": 11.298347473144531, + "kl": 0.2529296875, + "learning_rate": 9.352666666666667e-07, + "loss": 0.0101, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.8125, + "epoch": 0.1296, + "grad_norm": 15.721353530883789, + "kl": 0.53515625, + "learning_rate": 9.352e-07, + "loss": 0.0214, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.5, + "epoch": 0.12973333333333334, + "grad_norm": 17.148170471191406, + "kl": 0.4736328125, + "learning_rate": 9.351333333333334e-07, + "loss": 0.0189, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.125, + "epoch": 0.12986666666666666, + "grad_norm": 12.511069297790527, + "kl": 0.552734375, + "learning_rate": 9.350666666666666e-07, + "loss": 0.0222, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.875, + "epoch": 0.13, + "grad_norm": 16.339679718017578, + "kl": 0.59765625, + "learning_rate": 9.35e-07, + "loss": 0.0239, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.9375, + "epoch": 0.13013333333333332, + "grad_norm": 9.305192947387695, + "kl": 0.458984375, + "learning_rate": 9.349333333333332e-07, + "loss": 0.0184, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.1875, + "epoch": 0.13026666666666667, + "grad_norm": 17.26728630065918, + "kl": 0.765625, + "learning_rate": 9.348666666666666e-07, + "loss": 0.0307, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.3125, + "epoch": 0.1304, + "grad_norm": 9.67686653137207, + "kl": 0.46484375, + "learning_rate": 9.347999999999999e-07, + "loss": 0.0186, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.75, + "epoch": 0.13053333333333333, + "grad_norm": 27.643247604370117, + "kl": 0.65234375, + "learning_rate": 9.347333333333332e-07, + "loss": 0.0261, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.75, + "epoch": 0.13066666666666665, + "grad_norm": 9.526607513427734, + "kl": 0.27490234375, + "learning_rate": 9.346666666666666e-07, + "loss": 0.011, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.0625, + "epoch": 0.1308, + "grad_norm": 18.749143600463867, + "kl": 1.42236328125, + "learning_rate": 9.345999999999999e-07, + "loss": 0.0568, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.5, + "epoch": 0.13093333333333335, + "grad_norm": 22.092037200927734, + "kl": 0.275390625, + "learning_rate": 9.345333333333333e-07, + "loss": 0.011, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5, + "epoch": 0.13106666666666666, + "grad_norm": 3.1110334396362305, + "kl": 0.5029296875, + "learning_rate": 9.344666666666666e-07, + "loss": 0.0201, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.875, + "epoch": 0.1312, + "grad_norm": 13.119982719421387, + "kl": 0.548828125, + "learning_rate": 9.344e-07, + "loss": 0.0219, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.5625, + "epoch": 0.13133333333333333, + "grad_norm": 6.889904022216797, + "kl": 0.61572265625, + "learning_rate": 9.343333333333333e-07, + "loss": 0.0246, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.3125, + "epoch": 0.13146666666666668, + "grad_norm": 15.732359886169434, + "kl": 0.90625, + "learning_rate": 9.342666666666667e-07, + "loss": 0.0362, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 0.1316, + "grad_norm": 9.152389526367188, + "kl": 0.3642578125, + "learning_rate": 9.342e-07, + "loss": 0.0146, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.375, + "epoch": 0.13173333333333334, + "grad_norm": 9.869096755981445, + "kl": 0.162109375, + "learning_rate": 9.341333333333333e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.13186666666666666, + "grad_norm": 8.506702423095703, + "kl": 0.1943359375, + "learning_rate": 9.340666666666667e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.132, + "grad_norm": 9.133219718933105, + "kl": 0.2197265625, + "learning_rate": 9.34e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.5, + "epoch": 0.13213333333333332, + "grad_norm": 13.382052421569824, + "kl": 0.75390625, + "learning_rate": 9.339333333333333e-07, + "loss": 0.0302, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.75, + "epoch": 0.13226666666666667, + "grad_norm": 14.224002838134766, + "kl": 0.587890625, + "learning_rate": 9.338666666666666e-07, + "loss": 0.0235, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.125, + "epoch": 0.1324, + "grad_norm": 14.169334411621094, + "kl": 0.8125, + "learning_rate": 9.338e-07, + "loss": 0.0325, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.3125, + "epoch": 0.13253333333333334, + "grad_norm": 5.565518856048584, + "kl": 0.375, + "learning_rate": 9.337333333333333e-07, + "loss": 0.015, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.875, + "epoch": 0.13266666666666665, + "grad_norm": 16.31365966796875, + "kl": 0.9375, + "learning_rate": 9.336666666666666e-07, + "loss": 0.0374, + "reward": 0.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.875, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.1328, + "grad_norm": 10.655659675598145, + "kl": 0.330078125, + "learning_rate": 9.335999999999999e-07, + "loss": 0.0132, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.4375, + "epoch": 0.13293333333333332, + "grad_norm": 9.60655403137207, + "kl": 0.251953125, + "learning_rate": 9.335333333333332e-07, + "loss": 0.0101, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.0625, + "epoch": 0.13306666666666667, + "grad_norm": 13.169769287109375, + "kl": 1.09765625, + "learning_rate": 9.334666666666666e-07, + "loss": 0.0439, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.0, + "epoch": 0.1332, + "grad_norm": 16.17342758178711, + "kl": 1.02734375, + "learning_rate": 9.333999999999999e-07, + "loss": 0.041, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.625, + "epoch": 0.13333333333333333, + "grad_norm": 24.046024322509766, + "kl": 0.76171875, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0304, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.0, + "epoch": 0.13346666666666668, + "grad_norm": 36.177978515625, + "kl": 1.01171875, + "learning_rate": 9.332666666666666e-07, + "loss": 0.0405, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.375, + "epoch": 0.1336, + "grad_norm": 1056.2471923828125, + "kl": 10.55078125, + "learning_rate": 9.332e-07, + "loss": 0.4214, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.0, + "epoch": 0.13373333333333334, + "grad_norm": 11.045502662658691, + "kl": 0.96875, + "learning_rate": 9.331333333333333e-07, + "loss": 0.0388, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.6875, + "epoch": 0.13386666666666666, + "grad_norm": 12.682072639465332, + "kl": 0.728515625, + "learning_rate": 9.330666666666667e-07, + "loss": 0.0291, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.875, + "epoch": 0.134, + "grad_norm": 19.88066291809082, + "kl": 1.01171875, + "learning_rate": 9.33e-07, + "loss": 0.0404, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.8125, + "epoch": 0.13413333333333333, + "grad_norm": 358.60821533203125, + "kl": 7.32421875, + "learning_rate": 9.329333333333332e-07, + "loss": 0.2937, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 15.6875, + "epoch": 0.13426666666666667, + "grad_norm": 9.635062217712402, + "kl": 0.802734375, + "learning_rate": 9.328666666666666e-07, + "loss": 0.032, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.875, + "epoch": 0.1344, + "grad_norm": 73.40388488769531, + "kl": 0.70703125, + "learning_rate": 9.327999999999999e-07, + "loss": 0.0283, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.875, + "epoch": 0.13453333333333334, + "grad_norm": 10.448966026306152, + "kl": 0.833984375, + "learning_rate": 9.327333333333333e-07, + "loss": 0.0333, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.625, + "epoch": 0.13466666666666666, + "grad_norm": 48.78843688964844, + "kl": 3.28125, + "learning_rate": 9.326666666666666e-07, + "loss": 0.1314, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0625, + "epoch": 0.1348, + "grad_norm": 13.950713157653809, + "kl": 0.4267578125, + "learning_rate": 9.326e-07, + "loss": 0.0171, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.875, + "epoch": 0.13493333333333332, + "grad_norm": 4.845217227935791, + "kl": 0.748046875, + "learning_rate": 9.325333333333333e-07, + "loss": 0.03, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.0, + "epoch": 0.13506666666666667, + "grad_norm": 24.55486488342285, + "kl": 0.5517578125, + "learning_rate": 9.324666666666667e-07, + "loss": 0.0221, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1875, + "epoch": 0.1352, + "grad_norm": 33.95841979980469, + "kl": 1.451171875, + "learning_rate": 9.324e-07, + "loss": 0.0581, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.5, + "epoch": 0.13533333333333333, + "grad_norm": 14.222892761230469, + "kl": 1.1015625, + "learning_rate": 9.323333333333334e-07, + "loss": 0.044, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 12.75, + "epoch": 0.13546666666666668, + "grad_norm": 0.49518442153930664, + "kl": 0.818359375, + "learning_rate": 9.322666666666667e-07, + "loss": 0.0328, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.25, + "epoch": 0.1356, + "grad_norm": 4.065249443054199, + "kl": 1.05078125, + "learning_rate": 9.321999999999999e-07, + "loss": 0.042, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 10.6875, + "epoch": 0.13573333333333334, + "grad_norm": 64.4009780883789, + "kl": 1.1796875, + "learning_rate": 9.321333333333333e-07, + "loss": 0.0473, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.625, + "epoch": 0.13586666666666666, + "grad_norm": 12.700339317321777, + "kl": 1.01171875, + "learning_rate": 9.320666666666666e-07, + "loss": 0.0404, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.3125, + "epoch": 0.136, + "grad_norm": 9.903890609741211, + "kl": 0.400390625, + "learning_rate": 9.32e-07, + "loss": 0.016, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5, + "epoch": 0.13613333333333333, + "grad_norm": 22.8408260345459, + "kl": 2.84765625, + "learning_rate": 9.319333333333332e-07, + "loss": 0.1141, + "reward": 1.125, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8125, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5, + "epoch": 0.13626666666666667, + "grad_norm": 5.633387565612793, + "kl": 0.3720703125, + "learning_rate": 9.318666666666666e-07, + "loss": 0.0148, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.1364, + "grad_norm": 16.678274154663086, + "kl": 0.630859375, + "learning_rate": 9.317999999999999e-07, + "loss": 0.0252, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.625, + "epoch": 0.13653333333333334, + "grad_norm": 33.53725814819336, + "kl": 0.6328125, + "learning_rate": 9.317333333333333e-07, + "loss": 0.0253, + "reward": 1.125, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.625, + "epoch": 0.13666666666666666, + "grad_norm": 10.176992416381836, + "kl": 0.43212890625, + "learning_rate": 9.316666666666666e-07, + "loss": 0.0173, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.5, + "epoch": 0.1368, + "grad_norm": 178.47706604003906, + "kl": 1.146484375, + "learning_rate": 9.315999999999999e-07, + "loss": 0.0459, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3125, + "epoch": 0.13693333333333332, + "grad_norm": 18.546878814697266, + "kl": 0.458984375, + "learning_rate": 9.315333333333333e-07, + "loss": 0.0184, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.3125, + "epoch": 0.13706666666666667, + "grad_norm": 10.664469718933105, + "kl": 0.28271484375, + "learning_rate": 9.314666666666666e-07, + "loss": 0.0113, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.625, + "epoch": 0.1372, + "grad_norm": 11.159198760986328, + "kl": 0.3916015625, + "learning_rate": 9.314e-07, + "loss": 0.0157, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.875, + "epoch": 0.13733333333333334, + "grad_norm": 10.353288650512695, + "kl": 0.23583984375, + "learning_rate": 9.313333333333333e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.13746666666666665, + "grad_norm": 16.241107940673828, + "kl": 0.5751953125, + "learning_rate": 9.312666666666667e-07, + "loss": 0.023, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.25, + "epoch": 0.1376, + "grad_norm": 9.829272270202637, + "kl": 0.220703125, + "learning_rate": 9.312e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5, + "epoch": 0.13773333333333335, + "grad_norm": 7.285270690917969, + "kl": 0.6318359375, + "learning_rate": 9.311333333333334e-07, + "loss": 0.0254, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.875, + "epoch": 0.13786666666666667, + "grad_norm": 81.33642578125, + "kl": 0.5185546875, + "learning_rate": 9.310666666666667e-07, + "loss": 0.0207, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.138, + "grad_norm": 10.70736026763916, + "kl": 0.4365234375, + "learning_rate": 9.31e-07, + "loss": 0.0174, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.13813333333333333, + "grad_norm": 4.279684543609619, + "kl": 0.39111328125, + "learning_rate": 9.309333333333333e-07, + "loss": 0.0157, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.625, + "epoch": 0.13826666666666668, + "grad_norm": 12.389534950256348, + "kl": 0.3818359375, + "learning_rate": 9.308666666666665e-07, + "loss": 0.0153, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.375, + "epoch": 0.1384, + "grad_norm": 8.942885398864746, + "kl": 0.22607421875, + "learning_rate": 9.307999999999999e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1875, + "epoch": 0.13853333333333334, + "grad_norm": 8.052688598632812, + "kl": 0.20751953125, + "learning_rate": 9.307333333333332e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5625, + "epoch": 0.13866666666666666, + "grad_norm": 5.809882640838623, + "kl": 0.21728515625, + "learning_rate": 9.306666666666666e-07, + "loss": 0.0087, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.0625, + "epoch": 0.1388, + "grad_norm": 12.679327964782715, + "kl": 0.435546875, + "learning_rate": 9.305999999999999e-07, + "loss": 0.0174, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.5, + "epoch": 0.13893333333333333, + "grad_norm": 5.966148376464844, + "kl": 0.1806640625, + "learning_rate": 9.305333333333333e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.13906666666666667, + "grad_norm": 38.4003791809082, + "kl": 2.5205078125, + "learning_rate": 9.304666666666666e-07, + "loss": 0.1011, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.1392, + "grad_norm": 3.701033592224121, + "kl": 0.2744140625, + "learning_rate": 9.303999999999999e-07, + "loss": 0.011, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.13933333333333334, + "grad_norm": 17.966934204101562, + "kl": 0.955078125, + "learning_rate": 9.303333333333333e-07, + "loss": 0.0383, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.75, + "epoch": 0.13946666666666666, + "grad_norm": 10.56161117553711, + "kl": 0.4287109375, + "learning_rate": 9.302666666666666e-07, + "loss": 0.0172, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.1875, + "epoch": 0.1396, + "grad_norm": 7.577552795410156, + "kl": 0.23095703125, + "learning_rate": 9.302e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.13973333333333332, + "grad_norm": 5.013609886169434, + "kl": 0.3427734375, + "learning_rate": 9.301333333333333e-07, + "loss": 0.0137, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.13986666666666667, + "grad_norm": 7.016177654266357, + "kl": 0.41015625, + "learning_rate": 9.300666666666667e-07, + "loss": 0.0164, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.375, + "epoch": 0.14, + "grad_norm": 11.692437171936035, + "kl": 0.3388671875, + "learning_rate": 9.3e-07, + "loss": 0.0136, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.75, + "epoch": 0.14013333333333333, + "grad_norm": 5.188955307006836, + "kl": 0.25244140625, + "learning_rate": 9.299333333333333e-07, + "loss": 0.0101, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3125, + "epoch": 0.14026666666666668, + "grad_norm": 5.501883506774902, + "kl": 0.26904296875, + "learning_rate": 9.298666666666666e-07, + "loss": 0.0108, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.1875, + "epoch": 0.1404, + "grad_norm": 6.276767730712891, + "kl": 0.1611328125, + "learning_rate": 9.297999999999999e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5, + "epoch": 0.14053333333333334, + "grad_norm": 0.6820192337036133, + "kl": 0.3359375, + "learning_rate": 9.297333333333333e-07, + "loss": 0.0135, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0625, + "epoch": 0.14066666666666666, + "grad_norm": 8.083728790283203, + "kl": 0.1845703125, + "learning_rate": 9.296666666666666e-07, + "loss": 0.0074, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.875, + "epoch": 0.1408, + "grad_norm": 19.874252319335938, + "kl": 0.224609375, + "learning_rate": 9.296e-07, + "loss": 0.009, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9375, + "epoch": 0.14093333333333333, + "grad_norm": 6.162265300750732, + "kl": 0.1552734375, + "learning_rate": 9.295333333333333e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.6875, + "epoch": 0.14106666666666667, + "grad_norm": 8.430440902709961, + "kl": 0.3828125, + "learning_rate": 9.294666666666667e-07, + "loss": 0.0153, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.1412, + "grad_norm": 56.37784957885742, + "kl": 0.22119140625, + "learning_rate": 9.293999999999999e-07, + "loss": 0.0089, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9375, + "epoch": 0.14133333333333334, + "grad_norm": 5.50769567489624, + "kl": 0.1513671875, + "learning_rate": 9.293333333333333e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 0.14146666666666666, + "grad_norm": 0.9559260606765747, + "kl": 0.29248046875, + "learning_rate": 9.292666666666666e-07, + "loss": 0.0117, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.1416, + "grad_norm": 15.412026405334473, + "kl": 0.455078125, + "learning_rate": 9.292e-07, + "loss": 0.0182, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.14173333333333332, + "grad_norm": 8.251282691955566, + "kl": 0.291015625, + "learning_rate": 9.291333333333333e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.875, + "epoch": 0.14186666666666667, + "grad_norm": 6.568239688873291, + "kl": 0.28515625, + "learning_rate": 9.290666666666666e-07, + "loss": 0.0114, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.6875, + "epoch": 0.142, + "grad_norm": 14.159337043762207, + "kl": 0.3896484375, + "learning_rate": 9.29e-07, + "loss": 0.0156, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.9375, + "epoch": 0.14213333333333333, + "grad_norm": 8.281296730041504, + "kl": 0.15283203125, + "learning_rate": 9.289333333333333e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.14226666666666668, + "grad_norm": 7.475749969482422, + "kl": 0.34765625, + "learning_rate": 9.288666666666666e-07, + "loss": 0.0139, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.75, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0, + "epoch": 0.1424, + "grad_norm": 10.754387855529785, + "kl": 0.232421875, + "learning_rate": 9.287999999999999e-07, + "loss": 0.0093, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0625, + "epoch": 0.14253333333333335, + "grad_norm": 40.23849105834961, + "kl": 0.3017578125, + "learning_rate": 9.287333333333333e-07, + "loss": 0.0121, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 0.14266666666666666, + "grad_norm": 6.6054887771606445, + "kl": 0.212890625, + "learning_rate": 9.286666666666666e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.4375, + "epoch": 0.1428, + "grad_norm": 6.334168434143066, + "kl": 0.16259765625, + "learning_rate": 9.286e-07, + "loss": 0.0065, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.6875, + "epoch": 0.14293333333333333, + "grad_norm": 5.967108726501465, + "kl": 0.330078125, + "learning_rate": 9.285333333333333e-07, + "loss": 0.0132, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.14306666666666668, + "grad_norm": 7.434848785400391, + "kl": 0.3564453125, + "learning_rate": 9.284666666666666e-07, + "loss": 0.0143, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.1432, + "grad_norm": 6.5133748054504395, + "kl": 0.1728515625, + "learning_rate": 9.284e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.6875, + "epoch": 0.14333333333333334, + "grad_norm": 9.974546432495117, + "kl": 0.2685546875, + "learning_rate": 9.283333333333333e-07, + "loss": 0.0107, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.375, + "epoch": 0.14346666666666666, + "grad_norm": 5.950429439544678, + "kl": 0.328125, + "learning_rate": 9.282666666666667e-07, + "loss": 0.0132, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.75, + "epoch": 0.1436, + "grad_norm": 8.245086669921875, + "kl": 0.22119140625, + "learning_rate": 9.282e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.8125, + "epoch": 0.14373333333333332, + "grad_norm": 5.311071395874023, + "kl": 0.34033203125, + "learning_rate": 9.281333333333334e-07, + "loss": 0.0136, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.1875, + "epoch": 0.14386666666666667, + "grad_norm": 9.875997543334961, + "kl": 0.27685546875, + "learning_rate": 9.280666666666667e-07, + "loss": 0.0111, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.144, + "grad_norm": 8.454545974731445, + "kl": 0.22900390625, + "learning_rate": 9.28e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 0.14413333333333334, + "grad_norm": 6.741943836212158, + "kl": 0.18994140625, + "learning_rate": 9.279333333333333e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.75, + "epoch": 0.14426666666666665, + "grad_norm": 5.785976886749268, + "kl": 0.3291015625, + "learning_rate": 9.278666666666665e-07, + "loss": 0.0131, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.8125, + "epoch": 0.1444, + "grad_norm": 16.459104537963867, + "kl": 0.26806640625, + "learning_rate": 9.277999999999999e-07, + "loss": 0.0107, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.875, + "epoch": 0.14453333333333335, + "grad_norm": 4.675604820251465, + "kl": 0.21533203125, + "learning_rate": 9.277333333333332e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.8125, + "epoch": 0.14466666666666667, + "grad_norm": 8.233536720275879, + "kl": 0.17431640625, + "learning_rate": 9.276666666666666e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.375, + "epoch": 0.1448, + "grad_norm": 0.35130447149276733, + "kl": 0.14453125, + "learning_rate": 9.275999999999999e-07, + "loss": 0.0058, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.875, + "epoch": 0.14493333333333333, + "grad_norm": 9.933679580688477, + "kl": 0.162109375, + "learning_rate": 9.275333333333333e-07, + "loss": 0.0065, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0625, + "epoch": 0.14506666666666668, + "grad_norm": 6.850679397583008, + "kl": 0.14453125, + "learning_rate": 9.274666666666666e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.5, + "epoch": 0.1452, + "grad_norm": 6.042683124542236, + "kl": 0.24658203125, + "learning_rate": 9.274e-07, + "loss": 0.0098, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 0.14533333333333334, + "grad_norm": 7.19951868057251, + "kl": 0.20263671875, + "learning_rate": 9.273333333333333e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.6199793070554733, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.25, + "epoch": 0.14546666666666666, + "grad_norm": 34.523319244384766, + "kl": 0.16845703125, + "learning_rate": 9.272666666666666e-07, + "loss": 0.0067, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.875, + "epoch": 0.1456, + "grad_norm": 5.639756202697754, + "kl": 0.1484375, + "learning_rate": 9.272e-07, + "loss": 0.0059, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.125, + "epoch": 0.14573333333333333, + "grad_norm": 11.95203685760498, + "kl": 0.302734375, + "learning_rate": 9.271333333333333e-07, + "loss": 0.0121, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.125, + "epoch": 0.14586666666666667, + "grad_norm": 6.831001281738281, + "kl": 0.22998046875, + "learning_rate": 9.270666666666667e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.4375, + "epoch": 0.146, + "grad_norm": 6.101571559906006, + "kl": 0.2646484375, + "learning_rate": 9.27e-07, + "loss": 0.0106, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.4375, + "epoch": 0.14613333333333334, + "grad_norm": 9.01187515258789, + "kl": 0.31640625, + "learning_rate": 9.269333333333334e-07, + "loss": 0.0126, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 0.14626666666666666, + "grad_norm": 6.485546112060547, + "kl": 0.15478515625, + "learning_rate": 9.268666666666666e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 0.1464, + "grad_norm": 9.742229461669922, + "kl": 0.146240234375, + "learning_rate": 9.268e-07, + "loss": 0.0058, + "reward": 1.1875, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.14653333333333332, + "grad_norm": 12.589536666870117, + "kl": 0.2236328125, + "learning_rate": 9.267333333333333e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.14666666666666667, + "grad_norm": 11.991933822631836, + "kl": 0.201171875, + "learning_rate": 9.266666666666665e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.3125, + "epoch": 0.1468, + "grad_norm": 4.978590488433838, + "kl": 0.384765625, + "learning_rate": 9.265999999999999e-07, + "loss": 0.0154, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.75, + "epoch": 0.14693333333333333, + "grad_norm": 25.061622619628906, + "kl": 0.27392578125, + "learning_rate": 9.265333333333332e-07, + "loss": 0.011, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.14706666666666668, + "grad_norm": 14.200199127197266, + "kl": 0.18701171875, + "learning_rate": 9.264666666666666e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.875, + "epoch": 0.1472, + "grad_norm": 9.138663291931152, + "kl": 0.205078125, + "learning_rate": 9.263999999999999e-07, + "loss": 0.0082, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.14733333333333334, + "grad_norm": 9.543628692626953, + "kl": 0.21044921875, + "learning_rate": 9.263333333333333e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.125, + "epoch": 0.14746666666666666, + "grad_norm": 0.7080317139625549, + "kl": 0.232421875, + "learning_rate": 9.262666666666666e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.6875, + "epoch": 0.1476, + "grad_norm": 10.179909706115723, + "kl": 0.21826171875, + "learning_rate": 9.262e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.0625, + "epoch": 0.14773333333333333, + "grad_norm": 5.710690975189209, + "kl": 0.22607421875, + "learning_rate": 9.261333333333333e-07, + "loss": 0.009, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.1875, + "epoch": 0.14786666666666667, + "grad_norm": 6.453667163848877, + "kl": 0.20751953125, + "learning_rate": 9.260666666666667e-07, + "loss": 0.0083, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.125, + "epoch": 0.148, + "grad_norm": 13.203377723693848, + "kl": 0.37109375, + "learning_rate": 9.26e-07, + "loss": 0.0148, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.125, + "epoch": 0.14813333333333334, + "grad_norm": 8.34619140625, + "kl": 0.14892578125, + "learning_rate": 9.259333333333333e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.8125, + "epoch": 0.14826666666666666, + "grad_norm": 5.90771484375, + "kl": 0.294921875, + "learning_rate": 9.258666666666666e-07, + "loss": 0.0118, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.375, + "epoch": 0.1484, + "grad_norm": 8.408160209655762, + "kl": 0.421875, + "learning_rate": 9.257999999999999e-07, + "loss": 0.0169, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0625, + "epoch": 0.14853333333333332, + "grad_norm": 8.031044960021973, + "kl": 0.1982421875, + "learning_rate": 9.257333333333333e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.5625, + "epoch": 0.14866666666666667, + "grad_norm": 17.613990783691406, + "kl": 0.22802734375, + "learning_rate": 9.256666666666666e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0, + "epoch": 0.1488, + "grad_norm": 7.69054651260376, + "kl": 0.14697265625, + "learning_rate": 9.256e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.8125, + "epoch": 0.14893333333333333, + "grad_norm": 6.016312599182129, + "kl": 0.19091796875, + "learning_rate": 9.255333333333333e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.14906666666666665, + "grad_norm": 4.821803092956543, + "kl": 0.18017578125, + "learning_rate": 9.254666666666667e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.625, + "epoch": 0.1492, + "grad_norm": 4.410921096801758, + "kl": 0.150146484375, + "learning_rate": 9.254e-07, + "loss": 0.006, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.4375, + "epoch": 0.14933333333333335, + "grad_norm": 6.172380447387695, + "kl": 0.19287109375, + "learning_rate": 9.253333333333333e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8125, + "epoch": 0.14946666666666666, + "grad_norm": 5.507275104522705, + "kl": 0.16796875, + "learning_rate": 9.252666666666667e-07, + "loss": 0.0067, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.375, + "epoch": 0.1496, + "grad_norm": 7.08358907699585, + "kl": 0.1494140625, + "learning_rate": 9.251999999999999e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.14973333333333333, + "grad_norm": 6.572317123413086, + "kl": 0.181640625, + "learning_rate": 9.251333333333333e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8125, + "epoch": 0.14986666666666668, + "grad_norm": 4.892307758331299, + "kl": 0.149658203125, + "learning_rate": 9.250666666666666e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.4375, + "epoch": 0.15, + "grad_norm": 10.405821800231934, + "kl": 0.18359375, + "learning_rate": 9.25e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.25, + "epoch": 0.15013333333333334, + "grad_norm": 6.2736945152282715, + "kl": 0.1845703125, + "learning_rate": 9.249333333333333e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.15026666666666666, + "grad_norm": 5.3033881187438965, + "kl": 0.3525390625, + "learning_rate": 9.248666666666666e-07, + "loss": 0.0141, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.1504, + "grad_norm": 26.33980941772461, + "kl": 0.26318359375, + "learning_rate": 9.247999999999999e-07, + "loss": 0.0105, + "reward": 1.125, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.15053333333333332, + "grad_norm": 7.421051502227783, + "kl": 0.2841796875, + "learning_rate": 9.247333333333332e-07, + "loss": 0.0114, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.9375, + "epoch": 0.15066666666666667, + "grad_norm": 16.104204177856445, + "kl": 0.16357421875, + "learning_rate": 9.246666666666666e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.1508, + "grad_norm": 6.043214797973633, + "kl": 0.23974609375, + "learning_rate": 9.245999999999999e-07, + "loss": 0.0096, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.9375, + "epoch": 0.15093333333333334, + "grad_norm": 5.3343658447265625, + "kl": 0.156005859375, + "learning_rate": 9.245333333333333e-07, + "loss": 0.0062, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.0, + "epoch": 0.15106666666666665, + "grad_norm": 0.7171711325645447, + "kl": 0.3515625, + "learning_rate": 9.244666666666666e-07, + "loss": 0.0141, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.1512, + "grad_norm": 5.296905040740967, + "kl": 0.2783203125, + "learning_rate": 9.244e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.6875, + "epoch": 0.15133333333333332, + "grad_norm": 4.456014633178711, + "kl": 0.25537109375, + "learning_rate": 9.243333333333333e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.6875, + "epoch": 0.15146666666666667, + "grad_norm": 5.605232238769531, + "kl": 0.29248046875, + "learning_rate": 9.242666666666667e-07, + "loss": 0.0117, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.125, + "epoch": 0.1516, + "grad_norm": 8.729531288146973, + "kl": 0.2822265625, + "learning_rate": 9.242e-07, + "loss": 0.0113, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.15173333333333333, + "grad_norm": 5.024562358856201, + "kl": 0.1708984375, + "learning_rate": 9.241333333333333e-07, + "loss": 0.0068, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.9375, + "epoch": 0.15186666666666668, + "grad_norm": 7.7777509689331055, + "kl": 0.16455078125, + "learning_rate": 9.240666666666667e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.4375, + "epoch": 0.152, + "grad_norm": 31.658164978027344, + "kl": 0.50146484375, + "learning_rate": 9.24e-07, + "loss": 0.0201, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0, + "epoch": 0.15213333333333334, + "grad_norm": 6.996678829193115, + "kl": 0.16943359375, + "learning_rate": 9.239333333333334e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.15226666666666666, + "grad_norm": 6.531402111053467, + "kl": 0.1552734375, + "learning_rate": 9.238666666666665e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.1524, + "grad_norm": 5.669389724731445, + "kl": 0.20849609375, + "learning_rate": 9.238e-07, + "loss": 0.0083, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.75, + "epoch": 0.15253333333333333, + "grad_norm": 9.233237266540527, + "kl": 0.101806640625, + "learning_rate": 9.237333333333332e-07, + "loss": 0.0041, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.15266666666666667, + "grad_norm": 8.023772239685059, + "kl": 0.18115234375, + "learning_rate": 9.236666666666666e-07, + "loss": 0.0073, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.1528, + "grad_norm": 9.954336166381836, + "kl": 0.25439453125, + "learning_rate": 9.235999999999999e-07, + "loss": 0.0102, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.9375, + "epoch": 0.15293333333333334, + "grad_norm": 19.7211971282959, + "kl": 0.19921875, + "learning_rate": 9.235333333333332e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.375, + "epoch": 0.15306666666666666, + "grad_norm": 6.461864948272705, + "kl": 0.396484375, + "learning_rate": 9.234666666666666e-07, + "loss": 0.0159, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5625, + "epoch": 0.1532, + "grad_norm": 6.103114128112793, + "kl": 0.26708984375, + "learning_rate": 9.233999999999999e-07, + "loss": 0.0107, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 0.15333333333333332, + "grad_norm": 303.286865234375, + "kl": 0.302734375, + "learning_rate": 9.233333333333333e-07, + "loss": 0.0121, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.6875, + "epoch": 0.15346666666666667, + "grad_norm": 11.36125373840332, + "kl": 0.53564453125, + "learning_rate": 9.232666666666666e-07, + "loss": 0.0214, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.4375, + "epoch": 0.1536, + "grad_norm": 12.953367233276367, + "kl": 0.56201171875, + "learning_rate": 9.232e-07, + "loss": 0.0225, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.625, + "epoch": 0.15373333333333333, + "grad_norm": 70.30149841308594, + "kl": 0.60546875, + "learning_rate": 9.231333333333333e-07, + "loss": 0.0242, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.625, + "epoch": 0.15386666666666668, + "grad_norm": 8.479482650756836, + "kl": 0.2861328125, + "learning_rate": 9.230666666666667e-07, + "loss": 0.0115, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.154, + "grad_norm": 14.0729341506958, + "kl": 0.2109375, + "learning_rate": 9.23e-07, + "loss": 0.0085, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.75, + "epoch": 0.15413333333333334, + "grad_norm": 1.3511873483657837, + "kl": 0.892578125, + "learning_rate": 9.229333333333334e-07, + "loss": 0.0358, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.375, + "epoch": 0.15426666666666666, + "grad_norm": 8.224930763244629, + "kl": 0.2470703125, + "learning_rate": 9.228666666666666e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.6875, + "epoch": 0.1544, + "grad_norm": 1.0936923027038574, + "kl": 0.8359375, + "learning_rate": 9.227999999999999e-07, + "loss": 0.0335, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.75, + "epoch": 0.15453333333333333, + "grad_norm": 15.745967864990234, + "kl": 0.4287109375, + "learning_rate": 9.227333333333333e-07, + "loss": 0.0171, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.875, + "epoch": 0.15466666666666667, + "grad_norm": 15.497198104858398, + "kl": 0.837890625, + "learning_rate": 9.226666666666666e-07, + "loss": 0.0335, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.875, + "epoch": 0.1548, + "grad_norm": 14.756278991699219, + "kl": 0.67578125, + "learning_rate": 9.226e-07, + "loss": 0.027, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0, + "epoch": 0.15493333333333334, + "grad_norm": 9.311182975769043, + "kl": 0.21728515625, + "learning_rate": 9.225333333333333e-07, + "loss": 0.0087, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8125, + "epoch": 0.15506666666666666, + "grad_norm": 9.674345016479492, + "kl": 0.234375, + "learning_rate": 9.224666666666667e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.1552, + "grad_norm": 21.510433197021484, + "kl": 0.421875, + "learning_rate": 9.224e-07, + "loss": 0.0169, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.15533333333333332, + "grad_norm": 9.870179176330566, + "kl": 0.18212890625, + "learning_rate": 9.223333333333333e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.9375, + "epoch": 0.15546666666666667, + "grad_norm": 18.841135025024414, + "kl": 0.888671875, + "learning_rate": 9.222666666666666e-07, + "loss": 0.0355, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0625, + "epoch": 0.1556, + "grad_norm": 4.823714733123779, + "kl": 0.1552734375, + "learning_rate": 9.221999999999999e-07, + "loss": 0.0062, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.0, + "epoch": 0.15573333333333333, + "grad_norm": 11.321463584899902, + "kl": 0.36962890625, + "learning_rate": 9.221333333333333e-07, + "loss": 0.0148, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.1875, + "epoch": 0.15586666666666665, + "grad_norm": 14.901118278503418, + "kl": 0.36328125, + "learning_rate": 9.220666666666666e-07, + "loss": 0.0145, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.4375, + "epoch": 0.156, + "grad_norm": 15.247000694274902, + "kl": 0.845703125, + "learning_rate": 9.22e-07, + "loss": 0.0338, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.15613333333333335, + "grad_norm": 7.202177047729492, + "kl": 0.2724609375, + "learning_rate": 9.219333333333333e-07, + "loss": 0.0109, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.15626666666666666, + "grad_norm": 8.460097312927246, + "kl": 0.29736328125, + "learning_rate": 9.218666666666666e-07, + "loss": 0.0119, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.9375, + "epoch": 0.1564, + "grad_norm": 12.65771198272705, + "kl": 0.48828125, + "learning_rate": 9.217999999999999e-07, + "loss": 0.0195, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1875, + "epoch": 0.15653333333333333, + "grad_norm": 7.318619728088379, + "kl": 0.15478515625, + "learning_rate": 9.217333333333333e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5625, + "epoch": 0.15666666666666668, + "grad_norm": 12.7200288772583, + "kl": 0.2490234375, + "learning_rate": 9.216666666666666e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.1875, + "epoch": 0.1568, + "grad_norm": 18.576475143432617, + "kl": 0.28271484375, + "learning_rate": 9.215999999999999e-07, + "loss": 0.0113, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.4375, + "epoch": 0.15693333333333334, + "grad_norm": 5.305550575256348, + "kl": 0.404296875, + "learning_rate": 9.215333333333333e-07, + "loss": 0.0161, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.3125, + "epoch": 0.15706666666666666, + "grad_norm": 38.06208801269531, + "kl": 0.23486328125, + "learning_rate": 9.214666666666666e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0625, + "epoch": 0.1572, + "grad_norm": 0.7250728607177734, + "kl": 0.3916015625, + "learning_rate": 9.214e-07, + "loss": 0.0156, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.3125, + "epoch": 0.15733333333333333, + "grad_norm": 18.89055824279785, + "kl": 0.265625, + "learning_rate": 9.213333333333333e-07, + "loss": 0.0106, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.5, + "epoch": 0.15746666666666667, + "grad_norm": 12.545991897583008, + "kl": 0.37548828125, + "learning_rate": 9.212666666666667e-07, + "loss": 0.015, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.6875, + "epoch": 0.1576, + "grad_norm": 18.917585372924805, + "kl": 0.357421875, + "learning_rate": 9.212e-07, + "loss": 0.0143, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.9375, + "epoch": 0.15773333333333334, + "grad_norm": 10.09226131439209, + "kl": 0.26708984375, + "learning_rate": 9.211333333333334e-07, + "loss": 0.0107, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.6875, + "epoch": 0.15786666666666666, + "grad_norm": 111.55253601074219, + "kl": 0.3974609375, + "learning_rate": 9.210666666666667e-07, + "loss": 0.0159, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 21.3125, + "epoch": 0.158, + "grad_norm": 1.099952220916748, + "kl": 0.4482421875, + "learning_rate": 9.21e-07, + "loss": 0.0179, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.6875, + "epoch": 0.15813333333333332, + "grad_norm": 12.575104713439941, + "kl": 0.310546875, + "learning_rate": 9.209333333333333e-07, + "loss": 0.0124, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.15826666666666667, + "grad_norm": 8.40038776397705, + "kl": 0.24169921875, + "learning_rate": 9.208666666666665e-07, + "loss": 0.0097, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.375, + "epoch": 0.1584, + "grad_norm": 6.6641645431518555, + "kl": 0.25390625, + "learning_rate": 9.207999999999999e-07, + "loss": 0.0102, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.15853333333333333, + "grad_norm": 8.095481872558594, + "kl": 0.24072265625, + "learning_rate": 9.207333333333332e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.125, + "epoch": 0.15866666666666668, + "grad_norm": 8.848591804504395, + "kl": 0.16650390625, + "learning_rate": 9.206666666666666e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0, + "epoch": 0.1588, + "grad_norm": 4.910694122314453, + "kl": 0.1474609375, + "learning_rate": 9.205999999999999e-07, + "loss": 0.0059, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.75, + "epoch": 0.15893333333333334, + "grad_norm": 14.036213874816895, + "kl": 0.2646484375, + "learning_rate": 9.205333333333333e-07, + "loss": 0.0106, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 0.15906666666666666, + "grad_norm": 7.981796741485596, + "kl": 0.212890625, + "learning_rate": 9.204666666666666e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5625, + "epoch": 0.1592, + "grad_norm": 16.793212890625, + "kl": 0.221923828125, + "learning_rate": 9.203999999999999e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.5625, + "epoch": 0.15933333333333333, + "grad_norm": 0.5374444723129272, + "kl": 0.2802734375, + "learning_rate": 9.203333333333333e-07, + "loss": 0.0112, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8125, + "epoch": 0.15946666666666667, + "grad_norm": 12.295165061950684, + "kl": 0.203369140625, + "learning_rate": 9.202666666666666e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.1875, + "epoch": 0.1596, + "grad_norm": 9.687755584716797, + "kl": 0.21044921875, + "learning_rate": 9.202e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.15973333333333334, + "grad_norm": 5.924111843109131, + "kl": 0.209716796875, + "learning_rate": 9.201333333333333e-07, + "loss": 0.0084, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.15986666666666666, + "grad_norm": 7.723039627075195, + "kl": 0.1494140625, + "learning_rate": 9.200666666666667e-07, + "loss": 0.006, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5625, + "epoch": 0.16, + "grad_norm": 9.645587921142578, + "kl": 0.1826171875, + "learning_rate": 9.2e-07, + "loss": 0.0073, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.8125, + "epoch": 0.16013333333333332, + "grad_norm": 4.872499465942383, + "kl": 0.24951171875, + "learning_rate": 9.199333333333334e-07, + "loss": 0.01, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.375, + "epoch": 0.16026666666666667, + "grad_norm": 6.78210973739624, + "kl": 0.29150390625, + "learning_rate": 9.198666666666667e-07, + "loss": 0.0117, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.875, + "epoch": 0.1604, + "grad_norm": 4.250141143798828, + "kl": 0.232421875, + "learning_rate": 9.197999999999999e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.16053333333333333, + "grad_norm": 37.78932189941406, + "kl": 0.27001953125, + "learning_rate": 9.197333333333333e-07, + "loss": 0.0108, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.16066666666666668, + "grad_norm": 17.067752838134766, + "kl": 0.28369140625, + "learning_rate": 9.196666666666666e-07, + "loss": 0.0114, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.1608, + "grad_norm": 6.654172420501709, + "kl": 0.2119140625, + "learning_rate": 9.196e-07, + "loss": 0.0085, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.16093333333333334, + "grad_norm": 9.491950035095215, + "kl": 0.203125, + "learning_rate": 9.195333333333332e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.5, + "epoch": 0.16106666666666666, + "grad_norm": 9.376502990722656, + "kl": 0.63037109375, + "learning_rate": 9.194666666666666e-07, + "loss": 0.0251, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.8125, + "epoch": 0.1612, + "grad_norm": 12.27082347869873, + "kl": 0.6455078125, + "learning_rate": 9.193999999999999e-07, + "loss": 0.0258, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.16133333333333333, + "grad_norm": 7.480679035186768, + "kl": 0.244140625, + "learning_rate": 9.193333333333333e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9375, + "epoch": 0.16146666666666668, + "grad_norm": 8.17317008972168, + "kl": 0.26123046875, + "learning_rate": 9.192666666666666e-07, + "loss": 0.0105, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0625, + "epoch": 0.1616, + "grad_norm": 10.0855712890625, + "kl": 0.2392578125, + "learning_rate": 9.192e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.16173333333333334, + "grad_norm": 4.67252254486084, + "kl": 0.24267578125, + "learning_rate": 9.191333333333333e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.625, + "epoch": 0.16186666666666666, + "grad_norm": 13.302578926086426, + "kl": 0.31494140625, + "learning_rate": 9.190666666666666e-07, + "loss": 0.0126, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.1875, + "epoch": 0.162, + "grad_norm": 10.260138511657715, + "kl": 0.23046875, + "learning_rate": 9.19e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.375, + "epoch": 0.16213333333333332, + "grad_norm": 7.018295764923096, + "kl": 0.22412109375, + "learning_rate": 9.189333333333333e-07, + "loss": 0.0089, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.125, + "epoch": 0.16226666666666667, + "grad_norm": 9.956268310546875, + "kl": 0.41015625, + "learning_rate": 9.188666666666667e-07, + "loss": 0.0164, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.375, + "epoch": 0.1624, + "grad_norm": 4.7056193351745605, + "kl": 0.2451171875, + "learning_rate": 9.187999999999999e-07, + "loss": 0.0098, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.16253333333333334, + "grad_norm": 9.83360481262207, + "kl": 0.3701171875, + "learning_rate": 9.187333333333333e-07, + "loss": 0.0148, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.875, + "epoch": 0.16266666666666665, + "grad_norm": 7.439918041229248, + "kl": 0.36669921875, + "learning_rate": 9.186666666666666e-07, + "loss": 0.0146, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.1628, + "grad_norm": 9.110074996948242, + "kl": 0.322265625, + "learning_rate": 9.186e-07, + "loss": 0.0129, + "reward": 1.125, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.9375, + "epoch": 0.16293333333333335, + "grad_norm": 15.416579246520996, + "kl": 0.39453125, + "learning_rate": 9.185333333333333e-07, + "loss": 0.0158, + "reward": 1.5, + "reward_std": 0.6746576428413391, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 0.16306666666666667, + "grad_norm": 12.759389877319336, + "kl": 0.263671875, + "learning_rate": 9.184666666666666e-07, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.125, + "epoch": 0.1632, + "grad_norm": 62.00385665893555, + "kl": 1.359375, + "learning_rate": 9.184e-07, + "loss": 0.0544, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.3125, + "epoch": 0.16333333333333333, + "grad_norm": 14.179203033447266, + "kl": 0.310546875, + "learning_rate": 9.183333333333333e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.1875, + "epoch": 0.16346666666666668, + "grad_norm": 15.225814819335938, + "kl": 0.697265625, + "learning_rate": 9.182666666666667e-07, + "loss": 0.028, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.125, + "epoch": 0.1636, + "grad_norm": 10.44553279876709, + "kl": 0.373046875, + "learning_rate": 9.182e-07, + "loss": 0.0149, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.75, + "epoch": 0.16373333333333334, + "grad_norm": 2.0769596099853516, + "kl": 0.595703125, + "learning_rate": 9.181333333333333e-07, + "loss": 0.0237, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.3125, + "epoch": 0.16386666666666666, + "grad_norm": 9.82990837097168, + "kl": 0.4423828125, + "learning_rate": 9.180666666666666e-07, + "loss": 0.0176, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.6875, + "epoch": 0.164, + "grad_norm": 13.14173698425293, + "kl": 0.4833984375, + "learning_rate": 9.18e-07, + "loss": 0.0193, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.0, + "epoch": 0.16413333333333333, + "grad_norm": 7.677962303161621, + "kl": 0.517578125, + "learning_rate": 9.179333333333333e-07, + "loss": 0.0207, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.125, + "epoch": 0.16426666666666667, + "grad_norm": 10.121443748474121, + "kl": 0.646484375, + "learning_rate": 9.178666666666666e-07, + "loss": 0.0258, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.0625, + "epoch": 0.1644, + "grad_norm": 8.88951301574707, + "kl": 0.5107421875, + "learning_rate": 9.177999999999999e-07, + "loss": 0.0205, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.875, + "epoch": 0.16453333333333334, + "grad_norm": 8.177672386169434, + "kl": 0.689453125, + "learning_rate": 9.177333333333332e-07, + "loss": 0.0275, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.16466666666666666, + "grad_norm": 8.886040687561035, + "kl": 0.248046875, + "learning_rate": 9.176666666666666e-07, + "loss": 0.0099, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.0, + "epoch": 0.1648, + "grad_norm": 6.027076244354248, + "kl": 0.5791015625, + "learning_rate": 9.175999999999999e-07, + "loss": 0.0232, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.25, + "epoch": 0.16493333333333332, + "grad_norm": 66.80809020996094, + "kl": 0.8125, + "learning_rate": 9.175333333333333e-07, + "loss": 0.0326, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.875, + "epoch": 0.16506666666666667, + "grad_norm": 14.42548656463623, + "kl": 0.4404296875, + "learning_rate": 9.174666666666666e-07, + "loss": 0.0176, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 0.1652, + "grad_norm": 13.627549171447754, + "kl": 0.279296875, + "learning_rate": 9.174e-07, + "loss": 0.0112, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8125, + "epoch": 0.16533333333333333, + "grad_norm": 8.55677604675293, + "kl": 0.337890625, + "learning_rate": 9.173333333333333e-07, + "loss": 0.0135, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.625, + "epoch": 0.16546666666666668, + "grad_norm": 9.230977058410645, + "kl": 0.5390625, + "learning_rate": 9.172666666666666e-07, + "loss": 0.0216, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.375, + "epoch": 0.1656, + "grad_norm": 6.814620494842529, + "kl": 0.44775390625, + "learning_rate": 9.172e-07, + "loss": 0.0179, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.5, + "epoch": 0.16573333333333334, + "grad_norm": 18.845949172973633, + "kl": 0.4892578125, + "learning_rate": 9.171333333333333e-07, + "loss": 0.0196, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 0.16586666666666666, + "grad_norm": 10.205952644348145, + "kl": 0.27685546875, + "learning_rate": 9.170666666666667e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.1875, + "epoch": 0.166, + "grad_norm": 4.436732292175293, + "kl": 0.26513671875, + "learning_rate": 9.17e-07, + "loss": 0.0106, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.9375, + "epoch": 0.16613333333333333, + "grad_norm": 7.121496200561523, + "kl": 0.458984375, + "learning_rate": 9.169333333333334e-07, + "loss": 0.0183, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 0.16626666666666667, + "grad_norm": 10.256389617919922, + "kl": 0.2412109375, + "learning_rate": 9.168666666666667e-07, + "loss": 0.0097, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.1664, + "grad_norm": 0.7827883362770081, + "kl": 0.392578125, + "learning_rate": 9.168e-07, + "loss": 0.0157, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.8125, + "epoch": 0.16653333333333334, + "grad_norm": 13.562933921813965, + "kl": 0.4921875, + "learning_rate": 9.167333333333332e-07, + "loss": 0.0197, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.125, + "epoch": 0.16666666666666666, + "grad_norm": 4.3345866203308105, + "kl": 0.28173828125, + "learning_rate": 9.166666666666665e-07, + "loss": 0.0112, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.125, + "epoch": 0.1668, + "grad_norm": 7.734755039215088, + "kl": 0.22412109375, + "learning_rate": 9.165999999999999e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.9375, + "epoch": 0.16693333333333332, + "grad_norm": 12.85533332824707, + "kl": 0.42578125, + "learning_rate": 9.165333333333332e-07, + "loss": 0.017, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.16706666666666667, + "grad_norm": 8.594717025756836, + "kl": 0.32763671875, + "learning_rate": 9.164666666666666e-07, + "loss": 0.0131, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.4375, + "epoch": 0.1672, + "grad_norm": 7.1599040031433105, + "kl": 0.2138671875, + "learning_rate": 9.163999999999999e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.16733333333333333, + "grad_norm": 14.562186241149902, + "kl": 0.2578125, + "learning_rate": 9.163333333333333e-07, + "loss": 0.0103, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.16746666666666668, + "grad_norm": 7.2197651863098145, + "kl": 0.37890625, + "learning_rate": 9.162666666666666e-07, + "loss": 0.0152, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.4375, + "epoch": 0.1676, + "grad_norm": 9.147486686706543, + "kl": 0.27880859375, + "learning_rate": 9.162e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.375, + "epoch": 0.16773333333333335, + "grad_norm": 6.096780776977539, + "kl": 0.23291015625, + "learning_rate": 9.161333333333333e-07, + "loss": 0.0093, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.8125, + "epoch": 0.16786666666666666, + "grad_norm": 7.271346092224121, + "kl": 0.3037109375, + "learning_rate": 9.160666666666667e-07, + "loss": 0.0122, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 0.168, + "grad_norm": 5.030425071716309, + "kl": 0.32177734375, + "learning_rate": 9.16e-07, + "loss": 0.0129, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.16813333333333333, + "grad_norm": 11.635584831237793, + "kl": 0.48046875, + "learning_rate": 9.159333333333333e-07, + "loss": 0.0193, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.625, + "epoch": 0.16826666666666668, + "grad_norm": 12.78083324432373, + "kl": 0.548828125, + "learning_rate": 9.158666666666667e-07, + "loss": 0.0219, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.9375, + "epoch": 0.1684, + "grad_norm": 10.858388900756836, + "kl": 0.451171875, + "learning_rate": 9.157999999999999e-07, + "loss": 0.0181, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0625, + "epoch": 0.16853333333333334, + "grad_norm": 10.682758331298828, + "kl": 0.494140625, + "learning_rate": 9.157333333333333e-07, + "loss": 0.0197, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.4375, + "epoch": 0.16866666666666666, + "grad_norm": 5.1373467445373535, + "kl": 0.345703125, + "learning_rate": 9.156666666666666e-07, + "loss": 0.0138, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0625, + "epoch": 0.1688, + "grad_norm": 21.63310432434082, + "kl": 0.39453125, + "learning_rate": 9.156e-07, + "loss": 0.0158, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.3125, + "epoch": 0.16893333333333332, + "grad_norm": 10.669861793518066, + "kl": 0.458984375, + "learning_rate": 9.155333333333333e-07, + "loss": 0.0184, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.25, + "epoch": 0.16906666666666667, + "grad_norm": 13.000301361083984, + "kl": 0.5302734375, + "learning_rate": 9.154666666666667e-07, + "loss": 0.0212, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.375, + "epoch": 0.1692, + "grad_norm": 0.7966198325157166, + "kl": 0.44140625, + "learning_rate": 9.154e-07, + "loss": 0.0177, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.5625, + "epoch": 0.16933333333333334, + "grad_norm": 7.712100028991699, + "kl": 0.505859375, + "learning_rate": 9.153333333333332e-07, + "loss": 0.0202, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.0, + "epoch": 0.16946666666666665, + "grad_norm": 7.323558330535889, + "kl": 0.3076171875, + "learning_rate": 9.152666666666666e-07, + "loss": 0.0123, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.4375, + "epoch": 0.1696, + "grad_norm": 12.76833724975586, + "kl": 0.38720703125, + "learning_rate": 9.151999999999999e-07, + "loss": 0.0155, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.875, + "epoch": 0.16973333333333335, + "grad_norm": 13.956140518188477, + "kl": 0.3720703125, + "learning_rate": 9.151333333333333e-07, + "loss": 0.0149, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.16986666666666667, + "grad_norm": 7.017806529998779, + "kl": 0.3623046875, + "learning_rate": 9.150666666666666e-07, + "loss": 0.0145, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5625, + "epoch": 0.17, + "grad_norm": 10.31252384185791, + "kl": 0.287109375, + "learning_rate": 9.15e-07, + "loss": 0.0115, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.375, + "epoch": 0.17013333333333333, + "grad_norm": 47.25616455078125, + "kl": 0.34423828125, + "learning_rate": 9.149333333333333e-07, + "loss": 0.0138, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.125, + "epoch": 0.17026666666666668, + "grad_norm": 19.471858978271484, + "kl": 0.3876953125, + "learning_rate": 9.148666666666667e-07, + "loss": 0.0155, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.75, + "epoch": 0.1704, + "grad_norm": 9.416760444641113, + "kl": 0.30224609375, + "learning_rate": 9.147999999999999e-07, + "loss": 0.0121, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.875, + "epoch": 0.17053333333333334, + "grad_norm": 19.937725067138672, + "kl": 0.3330078125, + "learning_rate": 9.147333333333332e-07, + "loss": 0.0133, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.6875, + "epoch": 0.17066666666666666, + "grad_norm": 8.455708503723145, + "kl": 0.2158203125, + "learning_rate": 9.146666666666666e-07, + "loss": 0.0086, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 0.1708, + "grad_norm": 7.553625583648682, + "kl": 0.22705078125, + "learning_rate": 9.145999999999999e-07, + "loss": 0.0091, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.75, + "epoch": 0.17093333333333333, + "grad_norm": 11.295023918151855, + "kl": 0.291015625, + "learning_rate": 9.145333333333333e-07, + "loss": 0.0116, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.4375, + "epoch": 0.17106666666666667, + "grad_norm": 15.674420356750488, + "kl": 0.40625, + "learning_rate": 9.144666666666666e-07, + "loss": 0.0163, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.1712, + "grad_norm": 7.8951215744018555, + "kl": 0.2744140625, + "learning_rate": 9.144e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.625, + "epoch": 0.17133333333333334, + "grad_norm": 0.5734572410583496, + "kl": 0.2939453125, + "learning_rate": 9.143333333333333e-07, + "loss": 0.0117, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.17146666666666666, + "grad_norm": 8.314566612243652, + "kl": 0.26953125, + "learning_rate": 9.142666666666667e-07, + "loss": 0.0108, + "reward": 1.1875, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.1716, + "grad_norm": 7.156904220581055, + "kl": 0.294921875, + "learning_rate": 9.142e-07, + "loss": 0.0118, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.625, + "epoch": 0.17173333333333332, + "grad_norm": 1.1681872606277466, + "kl": 0.279296875, + "learning_rate": 9.141333333333333e-07, + "loss": 0.0112, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.8125, + "epoch": 0.17186666666666667, + "grad_norm": 0.6953992247581482, + "kl": 0.2958984375, + "learning_rate": 9.140666666666667e-07, + "loss": 0.0118, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.6875, + "epoch": 0.172, + "grad_norm": 3.710512161254883, + "kl": 0.28173828125, + "learning_rate": 9.14e-07, + "loss": 0.0113, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.625, + "epoch": 0.17213333333333333, + "grad_norm": 7.304062843322754, + "kl": 0.25, + "learning_rate": 9.139333333333334e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.9375, + "epoch": 0.17226666666666668, + "grad_norm": 9.055727005004883, + "kl": 0.21875, + "learning_rate": 9.138666666666666e-07, + "loss": 0.0087, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.1724, + "grad_norm": 271.79510498046875, + "kl": 0.2421875, + "learning_rate": 9.137999999999999e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5625, + "epoch": 0.17253333333333334, + "grad_norm": 16.118667602539062, + "kl": 0.23193359375, + "learning_rate": 9.137333333333332e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.4375, + "epoch": 0.17266666666666666, + "grad_norm": 6.505355358123779, + "kl": 0.21630859375, + "learning_rate": 9.136666666666666e-07, + "loss": 0.0086, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 0.1728, + "grad_norm": 6.649810314178467, + "kl": 0.16357421875, + "learning_rate": 9.135999999999999e-07, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.17293333333333333, + "grad_norm": 6.708252429962158, + "kl": 0.376953125, + "learning_rate": 9.135333333333332e-07, + "loss": 0.0151, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.6875, + "epoch": 0.17306666666666667, + "grad_norm": 10.36716079711914, + "kl": 0.22412109375, + "learning_rate": 9.134666666666666e-07, + "loss": 0.009, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.1732, + "grad_norm": 0.5774011015892029, + "kl": 0.33056640625, + "learning_rate": 9.133999999999999e-07, + "loss": 0.0132, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6875, + "epoch": 0.17333333333333334, + "grad_norm": 0.6491295099258423, + "kl": 0.220703125, + "learning_rate": 9.133333333333333e-07, + "loss": 0.0088, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.17346666666666666, + "grad_norm": 7.753295421600342, + "kl": 0.2197265625, + "learning_rate": 9.132666666666666e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.9375, + "epoch": 0.1736, + "grad_norm": 8.068643569946289, + "kl": 0.28125, + "learning_rate": 9.132e-07, + "loss": 0.0112, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.3125, + "epoch": 0.17373333333333332, + "grad_norm": 56.73080825805664, + "kl": 0.3623046875, + "learning_rate": 9.131333333333333e-07, + "loss": 0.0145, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.17386666666666667, + "grad_norm": 8.199398040771484, + "kl": 0.2734375, + "learning_rate": 9.130666666666667e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.875, + "epoch": 0.174, + "grad_norm": 15.643369674682617, + "kl": 0.978515625, + "learning_rate": 9.13e-07, + "loss": 0.039, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.125, + "epoch": 0.17413333333333333, + "grad_norm": 5.3225202560424805, + "kl": 0.22509765625, + "learning_rate": 9.129333333333334e-07, + "loss": 0.009, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.17426666666666665, + "grad_norm": 8.413633346557617, + "kl": 0.2333984375, + "learning_rate": 9.128666666666667e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.3125, + "epoch": 0.1744, + "grad_norm": 5.808276653289795, + "kl": 0.21728515625, + "learning_rate": 9.127999999999999e-07, + "loss": 0.0087, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5625, + "epoch": 0.17453333333333335, + "grad_norm": 31.723291397094727, + "kl": 0.16455078125, + "learning_rate": 9.127333333333333e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1875, + "epoch": 0.17466666666666666, + "grad_norm": 7.967096328735352, + "kl": 0.2041015625, + "learning_rate": 9.126666666666666e-07, + "loss": 0.0082, + "reward": 1.25, + "reward_std": 0.7301712930202484, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.6875, + "epoch": 0.1748, + "grad_norm": 12.165507316589355, + "kl": 0.22509765625, + "learning_rate": 9.126e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.5, + "epoch": 0.17493333333333333, + "grad_norm": 5.771701335906982, + "kl": 0.26318359375, + "learning_rate": 9.125333333333332e-07, + "loss": 0.0105, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0, + "epoch": 0.17506666666666668, + "grad_norm": 5.458629608154297, + "kl": 0.20263671875, + "learning_rate": 9.124666666666666e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5, + "epoch": 0.1752, + "grad_norm": 6.395997524261475, + "kl": 0.20458984375, + "learning_rate": 9.123999999999999e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.75, + "epoch": 0.17533333333333334, + "grad_norm": 7.80338716506958, + "kl": 0.2568359375, + "learning_rate": 9.123333333333333e-07, + "loss": 0.0103, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.17546666666666666, + "grad_norm": 4.9379119873046875, + "kl": 0.22607421875, + "learning_rate": 9.122666666666666e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 0.1756, + "grad_norm": 24.99510383605957, + "kl": 1.044921875, + "learning_rate": 9.121999999999999e-07, + "loss": 0.0419, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.375, + "epoch": 0.17573333333333332, + "grad_norm": 4.121872425079346, + "kl": 0.2021484375, + "learning_rate": 9.121333333333333e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.125, + "epoch": 0.17586666666666667, + "grad_norm": 3.591230869293213, + "kl": 0.19677734375, + "learning_rate": 9.120666666666666e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6875, + "epoch": 0.176, + "grad_norm": 5.536583423614502, + "kl": 0.2158203125, + "learning_rate": 9.12e-07, + "loss": 0.0086, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5625, + "epoch": 0.17613333333333334, + "grad_norm": 9.553511619567871, + "kl": 0.33203125, + "learning_rate": 9.119333333333333e-07, + "loss": 0.0133, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.17626666666666665, + "grad_norm": 5.46744966506958, + "kl": 0.302734375, + "learning_rate": 9.118666666666667e-07, + "loss": 0.0121, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 0.1764, + "grad_norm": 4.763887882232666, + "kl": 0.21435546875, + "learning_rate": 9.118e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.8125, + "epoch": 0.17653333333333332, + "grad_norm": 5.471235275268555, + "kl": 0.234375, + "learning_rate": 9.117333333333333e-07, + "loss": 0.0094, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.17666666666666667, + "grad_norm": 7.31444787979126, + "kl": 0.28173828125, + "learning_rate": 9.116666666666666e-07, + "loss": 0.0113, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 0.1768, + "grad_norm": 11.946218490600586, + "kl": 0.2275390625, + "learning_rate": 9.115999999999999e-07, + "loss": 0.0091, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.375, + "epoch": 0.17693333333333333, + "grad_norm": 4.460653781890869, + "kl": 0.19873046875, + "learning_rate": 9.115333333333333e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.17706666666666668, + "grad_norm": 6.918564319610596, + "kl": 0.16748046875, + "learning_rate": 9.114666666666666e-07, + "loss": 0.0067, + "reward": 0.9375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.1772, + "grad_norm": 5.836019039154053, + "kl": 0.14404296875, + "learning_rate": 9.114e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.125, + "epoch": 0.17733333333333334, + "grad_norm": 7.671666622161865, + "kl": 0.17138671875, + "learning_rate": 9.113333333333333e-07, + "loss": 0.0069, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 0.17746666666666666, + "grad_norm": 10.114519119262695, + "kl": 0.19189453125, + "learning_rate": 9.112666666666667e-07, + "loss": 0.0077, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.3125, + "epoch": 0.1776, + "grad_norm": 31.20867347717285, + "kl": 0.23388671875, + "learning_rate": 9.112e-07, + "loss": 0.0094, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.4375, + "epoch": 0.17773333333333333, + "grad_norm": 7.196924686431885, + "kl": 0.21630859375, + "learning_rate": 9.111333333333334e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.17786666666666667, + "grad_norm": 5.397729873657227, + "kl": 0.21240234375, + "learning_rate": 9.110666666666666e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.75, + "epoch": 0.178, + "grad_norm": 12.288978576660156, + "kl": 0.2958984375, + "learning_rate": 9.109999999999999e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.5625, + "epoch": 0.17813333333333334, + "grad_norm": 9.685152053833008, + "kl": 0.20703125, + "learning_rate": 9.109333333333333e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.125, + "epoch": 0.17826666666666666, + "grad_norm": 11.331077575683594, + "kl": 0.35693359375, + "learning_rate": 9.108666666666666e-07, + "loss": 0.0142, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0, + "epoch": 0.1784, + "grad_norm": 8.751553535461426, + "kl": 0.16015625, + "learning_rate": 9.108e-07, + "loss": 0.0064, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 0.17853333333333332, + "grad_norm": 9.900606155395508, + "kl": 0.20556640625, + "learning_rate": 9.107333333333332e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.17866666666666667, + "grad_norm": 6.502462863922119, + "kl": 0.17236328125, + "learning_rate": 9.106666666666666e-07, + "loss": 0.0069, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.3125, + "epoch": 0.1788, + "grad_norm": 6.407998085021973, + "kl": 0.116455078125, + "learning_rate": 9.105999999999999e-07, + "loss": 0.0047, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1875, + "epoch": 0.17893333333333333, + "grad_norm": 5.72733211517334, + "kl": 0.14111328125, + "learning_rate": 9.105333333333333e-07, + "loss": 0.0056, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 0.17906666666666668, + "grad_norm": 8.057046890258789, + "kl": 0.20751953125, + "learning_rate": 9.104666666666666e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.1792, + "grad_norm": 7.5614519119262695, + "kl": 0.1884765625, + "learning_rate": 9.103999999999999e-07, + "loss": 0.0075, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.625, + "epoch": 0.17933333333333334, + "grad_norm": 21.764892578125, + "kl": 0.2548828125, + "learning_rate": 9.103333333333333e-07, + "loss": 0.0102, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.8125, + "epoch": 0.17946666666666666, + "grad_norm": 12.253545761108398, + "kl": 0.3466796875, + "learning_rate": 9.102666666666666e-07, + "loss": 0.0139, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5625, + "epoch": 0.1796, + "grad_norm": 8.057282447814941, + "kl": 0.177734375, + "learning_rate": 9.102e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.17973333333333333, + "grad_norm": 13.148187637329102, + "kl": 0.35546875, + "learning_rate": 9.101333333333333e-07, + "loss": 0.0142, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.25, + "epoch": 0.17986666666666667, + "grad_norm": 1.7255090475082397, + "kl": 0.23876953125, + "learning_rate": 9.100666666666667e-07, + "loss": 0.0095, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 0.18, + "grad_norm": 21.59052848815918, + "kl": 0.14794921875, + "learning_rate": 9.1e-07, + "loss": 0.0059, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.6875, + "epoch": 0.18013333333333334, + "grad_norm": 10.259637832641602, + "kl": 0.1787109375, + "learning_rate": 9.099333333333334e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 0.18026666666666666, + "grad_norm": 8.607843399047852, + "kl": 0.14208984375, + "learning_rate": 9.098666666666667e-07, + "loss": 0.0057, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9375, + "epoch": 0.1804, + "grad_norm": 120.79493713378906, + "kl": 0.1728515625, + "learning_rate": 9.098000000000001e-07, + "loss": 0.0069, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.25, + "epoch": 0.18053333333333332, + "grad_norm": 38.527889251708984, + "kl": 0.1533203125, + "learning_rate": 9.097333333333332e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0625, + "epoch": 0.18066666666666667, + "grad_norm": 15.92745304107666, + "kl": 0.189697265625, + "learning_rate": 9.096666666666665e-07, + "loss": 0.0076, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.1808, + "grad_norm": 6.563441753387451, + "kl": 0.19775390625, + "learning_rate": 9.095999999999999e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 0.18093333333333333, + "grad_norm": 12.243468284606934, + "kl": 0.236328125, + "learning_rate": 9.095333333333332e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3125, + "epoch": 0.18106666666666665, + "grad_norm": 8.939661026000977, + "kl": 0.24755859375, + "learning_rate": 9.094666666666666e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.1812, + "grad_norm": 7.980691432952881, + "kl": 0.24755859375, + "learning_rate": 9.093999999999999e-07, + "loss": 0.0099, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.18133333333333335, + "grad_norm": 12.047783851623535, + "kl": 0.3193359375, + "learning_rate": 9.093333333333333e-07, + "loss": 0.0128, + "reward": 1.5625, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.6875, + "epoch": 0.18146666666666667, + "grad_norm": 8.382646560668945, + "kl": 0.2021484375, + "learning_rate": 9.092666666666666e-07, + "loss": 0.0081, + "reward": 1.125, + "reward_std": 0.5940381735563278, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.1816, + "grad_norm": 14.217604637145996, + "kl": 0.216796875, + "learning_rate": 9.092e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.25, + "epoch": 0.18173333333333333, + "grad_norm": 0.7284461259841919, + "kl": 0.1962890625, + "learning_rate": 9.091333333333333e-07, + "loss": 0.0079, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.625, + "epoch": 0.18186666666666668, + "grad_norm": 9.420584678649902, + "kl": 0.2783203125, + "learning_rate": 9.090666666666666e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.125, + "epoch": 0.182, + "grad_norm": 12.687766075134277, + "kl": 0.822265625, + "learning_rate": 9.09e-07, + "loss": 0.0328, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.375, + "epoch": 0.18213333333333334, + "grad_norm": 13.433035850524902, + "kl": 0.365234375, + "learning_rate": 9.089333333333333e-07, + "loss": 0.0146, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.18226666666666666, + "grad_norm": 6.548002243041992, + "kl": 0.27197265625, + "learning_rate": 9.088666666666667e-07, + "loss": 0.0109, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.25, + "epoch": 0.1824, + "grad_norm": 19.77490997314453, + "kl": 0.4306640625, + "learning_rate": 9.088e-07, + "loss": 0.0172, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.5625, + "epoch": 0.18253333333333333, + "grad_norm": 9.177395820617676, + "kl": 0.26611328125, + "learning_rate": 9.087333333333333e-07, + "loss": 0.0107, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.18266666666666667, + "grad_norm": 9.451336860656738, + "kl": 0.18798828125, + "learning_rate": 9.086666666666666e-07, + "loss": 0.0075, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.1828, + "grad_norm": 5.127388954162598, + "kl": 0.2333984375, + "learning_rate": 9.086e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0, + "epoch": 0.18293333333333334, + "grad_norm": 10.282869338989258, + "kl": 0.251953125, + "learning_rate": 9.085333333333333e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.6875, + "epoch": 0.18306666666666666, + "grad_norm": 7.412274360656738, + "kl": 0.21875, + "learning_rate": 9.084666666666666e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.4375, + "epoch": 0.1832, + "grad_norm": 5.431008815765381, + "kl": 0.29248046875, + "learning_rate": 9.084e-07, + "loss": 0.0117, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.18333333333333332, + "grad_norm": 12.200347900390625, + "kl": 0.36328125, + "learning_rate": 9.083333333333332e-07, + "loss": 0.0145, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.875, + "epoch": 0.18346666666666667, + "grad_norm": 9.545533180236816, + "kl": 0.18017578125, + "learning_rate": 9.082666666666666e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.125, + "epoch": 0.1836, + "grad_norm": 7.14875602722168, + "kl": 0.3115234375, + "learning_rate": 9.081999999999999e-07, + "loss": 0.0125, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.0, + "epoch": 0.18373333333333333, + "grad_norm": 9.807738304138184, + "kl": 0.21826171875, + "learning_rate": 9.081333333333333e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.18386666666666668, + "grad_norm": 13.469864845275879, + "kl": 0.521484375, + "learning_rate": 9.080666666666666e-07, + "loss": 0.0209, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.184, + "grad_norm": 9.665650367736816, + "kl": 0.26025390625, + "learning_rate": 9.08e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 0.18413333333333334, + "grad_norm": 12.541704177856445, + "kl": 0.279296875, + "learning_rate": 9.079333333333333e-07, + "loss": 0.0112, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.25, + "epoch": 0.18426666666666666, + "grad_norm": 11.812674522399902, + "kl": 0.193359375, + "learning_rate": 9.078666666666666e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5625, + "epoch": 0.1844, + "grad_norm": 9.133925437927246, + "kl": 0.17529296875, + "learning_rate": 9.078e-07, + "loss": 0.007, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.3125, + "epoch": 0.18453333333333333, + "grad_norm": 1.7101091146469116, + "kl": 0.34716796875, + "learning_rate": 9.077333333333332e-07, + "loss": 0.0139, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0625, + "epoch": 0.18466666666666667, + "grad_norm": 7.617630481719971, + "kl": 0.24072265625, + "learning_rate": 9.076666666666666e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 0.1848, + "grad_norm": 5.250242233276367, + "kl": 0.22314453125, + "learning_rate": 9.075999999999999e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.625, + "epoch": 0.18493333333333334, + "grad_norm": 9.456345558166504, + "kl": 0.4404296875, + "learning_rate": 9.075333333333333e-07, + "loss": 0.0176, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.25, + "epoch": 0.18506666666666666, + "grad_norm": 5.951539039611816, + "kl": 0.2255859375, + "learning_rate": 9.074666666666666e-07, + "loss": 0.009, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.375, + "epoch": 0.1852, + "grad_norm": 8.209853172302246, + "kl": 0.20263671875, + "learning_rate": 9.074e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.125, + "epoch": 0.18533333333333332, + "grad_norm": 4.379199028015137, + "kl": 0.2314453125, + "learning_rate": 9.073333333333333e-07, + "loss": 0.0093, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.375, + "epoch": 0.18546666666666667, + "grad_norm": 9.452452659606934, + "kl": 0.2763671875, + "learning_rate": 9.072666666666666e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.1856, + "grad_norm": 10.204052925109863, + "kl": 0.3818359375, + "learning_rate": 9.072e-07, + "loss": 0.0153, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.3125, + "epoch": 0.18573333333333333, + "grad_norm": 4.921051502227783, + "kl": 0.236328125, + "learning_rate": 9.071333333333333e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1875, + "epoch": 0.18586666666666668, + "grad_norm": 5.9686784744262695, + "kl": 0.20263671875, + "learning_rate": 9.070666666666667e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.375, + "epoch": 0.186, + "grad_norm": 9.032541275024414, + "kl": 0.2490234375, + "learning_rate": 9.07e-07, + "loss": 0.01, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.8125, + "epoch": 0.18613333333333335, + "grad_norm": 0.36189043521881104, + "kl": 0.1494140625, + "learning_rate": 9.069333333333334e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5, + "epoch": 0.18626666666666666, + "grad_norm": 9.35237979888916, + "kl": 0.28466796875, + "learning_rate": 9.068666666666666e-07, + "loss": 0.0114, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5, + "epoch": 0.1864, + "grad_norm": 6.273021697998047, + "kl": 0.16259765625, + "learning_rate": 9.068e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.18653333333333333, + "grad_norm": 8.314631462097168, + "kl": 0.1552734375, + "learning_rate": 9.067333333333332e-07, + "loss": 0.0062, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.9375, + "epoch": 0.18666666666666668, + "grad_norm": 8.135419845581055, + "kl": 0.185546875, + "learning_rate": 9.066666666666665e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.1868, + "grad_norm": 8.833259582519531, + "kl": 0.21875, + "learning_rate": 9.065999999999999e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5, + "epoch": 0.18693333333333334, + "grad_norm": 7.674072265625, + "kl": 0.1513671875, + "learning_rate": 9.065333333333332e-07, + "loss": 0.0061, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.625, + "epoch": 0.18706666666666666, + "grad_norm": 7.915067195892334, + "kl": 0.18896484375, + "learning_rate": 9.064666666666666e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 0.1872, + "grad_norm": 8.256525993347168, + "kl": 0.14990234375, + "learning_rate": 9.063999999999999e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 0.18733333333333332, + "grad_norm": 46.779354095458984, + "kl": 0.2275390625, + "learning_rate": 9.063333333333333e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.18746666666666667, + "grad_norm": 5.314449310302734, + "kl": 0.1552734375, + "learning_rate": 9.062666666666666e-07, + "loss": 0.0062, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.625, + "epoch": 0.1876, + "grad_norm": 58.753475189208984, + "kl": 0.15185546875, + "learning_rate": 9.062e-07, + "loss": 0.0061, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5625, + "epoch": 0.18773333333333334, + "grad_norm": 5.426820278167725, + "kl": 0.167236328125, + "learning_rate": 9.061333333333333e-07, + "loss": 0.0067, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0625, + "epoch": 0.18786666666666665, + "grad_norm": 4.467780113220215, + "kl": 0.19775390625, + "learning_rate": 9.060666666666667e-07, + "loss": 0.0079, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 0.188, + "grad_norm": 8.694202423095703, + "kl": 0.20556640625, + "learning_rate": 9.06e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.1875, + "epoch": 0.18813333333333335, + "grad_norm": 10.872429847717285, + "kl": 0.16845703125, + "learning_rate": 9.059333333333333e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.875, + "epoch": 0.18826666666666667, + "grad_norm": 9.44503402709961, + "kl": 0.19970703125, + "learning_rate": 9.058666666666667e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3125, + "epoch": 0.1884, + "grad_norm": 6.65391731262207, + "kl": 0.2548828125, + "learning_rate": 9.058e-07, + "loss": 0.0102, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.6875, + "epoch": 0.18853333333333333, + "grad_norm": 19.297428131103516, + "kl": 0.89208984375, + "learning_rate": 9.057333333333333e-07, + "loss": 0.0357, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5625, + "epoch": 0.18866666666666668, + "grad_norm": 7.849637031555176, + "kl": 0.20263671875, + "learning_rate": 9.056666666666666e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.0625, + "epoch": 0.1888, + "grad_norm": 10.938496589660645, + "kl": 0.34765625, + "learning_rate": 9.056e-07, + "loss": 0.0139, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0625, + "epoch": 0.18893333333333334, + "grad_norm": 7.6751556396484375, + "kl": 0.3037109375, + "learning_rate": 9.055333333333332e-07, + "loss": 0.0122, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.375, + "epoch": 0.18906666666666666, + "grad_norm": 0.5600578188896179, + "kl": 0.25732421875, + "learning_rate": 9.054666666666666e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.3125, + "epoch": 0.1892, + "grad_norm": 5.134878635406494, + "kl": 0.234375, + "learning_rate": 9.053999999999999e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.8125, + "epoch": 0.18933333333333333, + "grad_norm": 6.161218643188477, + "kl": 0.15673828125, + "learning_rate": 9.053333333333332e-07, + "loss": 0.0063, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0625, + "epoch": 0.18946666666666667, + "grad_norm": 8.465239524841309, + "kl": 0.16943359375, + "learning_rate": 9.052666666666666e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0, + "epoch": 0.1896, + "grad_norm": 5.549323558807373, + "kl": 0.132568359375, + "learning_rate": 9.051999999999999e-07, + "loss": 0.0053, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.18973333333333334, + "grad_norm": 7.612007141113281, + "kl": 0.20263671875, + "learning_rate": 9.051333333333333e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.18986666666666666, + "grad_norm": 0.4686659276485443, + "kl": 0.23828125, + "learning_rate": 9.050666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.6875, + "epoch": 0.19, + "grad_norm": 0.435535728931427, + "kl": 0.134765625, + "learning_rate": 9.05e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.125, + "epoch": 0.19013333333333332, + "grad_norm": 9.311565399169922, + "kl": 0.3662109375, + "learning_rate": 9.049333333333333e-07, + "loss": 0.0147, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.25, + "epoch": 0.19026666666666667, + "grad_norm": 5.690812110900879, + "kl": 0.13818359375, + "learning_rate": 9.048666666666667e-07, + "loss": 0.0055, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.125, + "epoch": 0.1904, + "grad_norm": 6.130938529968262, + "kl": 0.1953125, + "learning_rate": 9.048e-07, + "loss": 0.0078, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.9375, + "epoch": 0.19053333333333333, + "grad_norm": 0.49935656785964966, + "kl": 0.14208984375, + "learning_rate": 9.047333333333332e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.3125, + "epoch": 0.19066666666666668, + "grad_norm": 7.4752326011657715, + "kl": 0.228515625, + "learning_rate": 9.046666666666666e-07, + "loss": 0.0091, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.3125, + "epoch": 0.1908, + "grad_norm": 12.678033828735352, + "kl": 0.25341796875, + "learning_rate": 9.045999999999999e-07, + "loss": 0.0101, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.125, + "epoch": 0.19093333333333334, + "grad_norm": 1.388452410697937, + "kl": 0.21240234375, + "learning_rate": 9.045333333333333e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.4375, + "epoch": 0.19106666666666666, + "grad_norm": 8.032010078430176, + "kl": 0.203125, + "learning_rate": 9.044666666666666e-07, + "loss": 0.0081, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5, + "epoch": 0.1912, + "grad_norm": 4.583353519439697, + "kl": 0.18017578125, + "learning_rate": 9.044e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.19133333333333333, + "grad_norm": 8.152442932128906, + "kl": 0.15771484375, + "learning_rate": 9.043333333333333e-07, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.375, + "epoch": 0.19146666666666667, + "grad_norm": 8.20263385772705, + "kl": 0.21728515625, + "learning_rate": 9.042666666666667e-07, + "loss": 0.0087, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.1916, + "grad_norm": 9.286725997924805, + "kl": 0.2021484375, + "learning_rate": 9.042e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.19173333333333334, + "grad_norm": 4.809001922607422, + "kl": 0.16455078125, + "learning_rate": 9.041333333333334e-07, + "loss": 0.0066, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.19186666666666666, + "grad_norm": 8.030376434326172, + "kl": 0.16796875, + "learning_rate": 9.040666666666666e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3125, + "epoch": 0.192, + "grad_norm": 10.75333023071289, + "kl": 0.13818359375, + "learning_rate": 9.039999999999999e-07, + "loss": 0.0055, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.19213333333333332, + "grad_norm": 4.502605438232422, + "kl": 0.2939453125, + "learning_rate": 9.039333333333333e-07, + "loss": 0.0118, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.19226666666666667, + "grad_norm": 6.154731273651123, + "kl": 0.20751953125, + "learning_rate": 9.038666666666666e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.4375, + "epoch": 0.1924, + "grad_norm": 5.605429649353027, + "kl": 0.162109375, + "learning_rate": 9.038e-07, + "loss": 0.0065, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.9375, + "epoch": 0.19253333333333333, + "grad_norm": 0.6574320793151855, + "kl": 0.3251953125, + "learning_rate": 9.037333333333333e-07, + "loss": 0.013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.1875, + "epoch": 0.19266666666666668, + "grad_norm": 6.490789413452148, + "kl": 0.357421875, + "learning_rate": 9.036666666666666e-07, + "loss": 0.0143, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.1928, + "grad_norm": 8.740187644958496, + "kl": 0.298828125, + "learning_rate": 9.035999999999999e-07, + "loss": 0.0119, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4375, + "epoch": 0.19293333333333335, + "grad_norm": 6.968545436859131, + "kl": 0.166015625, + "learning_rate": 9.035333333333333e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0625, + "epoch": 0.19306666666666666, + "grad_norm": 4.143121242523193, + "kl": 0.19921875, + "learning_rate": 9.034666666666666e-07, + "loss": 0.008, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.5625, + "epoch": 0.1932, + "grad_norm": 8.18497371673584, + "kl": 0.248046875, + "learning_rate": 9.033999999999999e-07, + "loss": 0.0099, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.19333333333333333, + "grad_norm": 7.427906036376953, + "kl": 0.14892578125, + "learning_rate": 9.033333333333333e-07, + "loss": 0.006, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.19346666666666668, + "grad_norm": 6.307835578918457, + "kl": 0.189453125, + "learning_rate": 9.032666666666666e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4375, + "epoch": 0.1936, + "grad_norm": 7.090723037719727, + "kl": 0.1435546875, + "learning_rate": 9.032e-07, + "loss": 0.0057, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.1875, + "epoch": 0.19373333333333334, + "grad_norm": 7.7259602546691895, + "kl": 0.1962890625, + "learning_rate": 9.031333333333333e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.6875, + "epoch": 0.19386666666666666, + "grad_norm": 0.8777545094490051, + "kl": 0.19775390625, + "learning_rate": 9.030666666666667e-07, + "loss": 0.0079, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.194, + "grad_norm": 7.866794586181641, + "kl": 0.28271484375, + "learning_rate": 9.03e-07, + "loss": 0.0113, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.625, + "epoch": 0.19413333333333332, + "grad_norm": 0.36713650822639465, + "kl": 0.18359375, + "learning_rate": 9.029333333333334e-07, + "loss": 0.0074, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 0.19426666666666667, + "grad_norm": 4.234005928039551, + "kl": 0.1796875, + "learning_rate": 9.028666666666667e-07, + "loss": 0.0072, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5625, + "epoch": 0.1944, + "grad_norm": 8.492636680603027, + "kl": 0.18603515625, + "learning_rate": 9.028e-07, + "loss": 0.0075, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.25, + "epoch": 0.19453333333333334, + "grad_norm": 3.647982120513916, + "kl": 0.18603515625, + "learning_rate": 9.027333333333334e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5625, + "epoch": 0.19466666666666665, + "grad_norm": 7.62669563293457, + "kl": 0.18408203125, + "learning_rate": 9.026666666666665e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.1948, + "grad_norm": 4.99639892578125, + "kl": 0.259765625, + "learning_rate": 9.025999999999999e-07, + "loss": 0.0104, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.8125, + "epoch": 0.19493333333333332, + "grad_norm": 12.410894393920898, + "kl": 0.208984375, + "learning_rate": 9.025333333333332e-07, + "loss": 0.0084, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5625, + "epoch": 0.19506666666666667, + "grad_norm": 7.910408020019531, + "kl": 0.22412109375, + "learning_rate": 9.024666666666666e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5, + "epoch": 0.1952, + "grad_norm": 17.558040618896484, + "kl": 0.2490234375, + "learning_rate": 9.023999999999999e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 0.19533333333333333, + "grad_norm": 9.894105911254883, + "kl": 0.17529296875, + "learning_rate": 9.023333333333333e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8125, + "epoch": 0.19546666666666668, + "grad_norm": 0.4442465007305145, + "kl": 0.1767578125, + "learning_rate": 9.022666666666666e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.75, + "epoch": 0.1956, + "grad_norm": 6.498806476593018, + "kl": 0.322265625, + "learning_rate": 9.021999999999999e-07, + "loss": 0.0129, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.19573333333333334, + "grad_norm": 4.977087497711182, + "kl": 0.136962890625, + "learning_rate": 9.021333333333333e-07, + "loss": 0.0055, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0625, + "epoch": 0.19586666666666666, + "grad_norm": 10.697123527526855, + "kl": 0.18359375, + "learning_rate": 9.020666666666666e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.4375, + "epoch": 0.196, + "grad_norm": 9.133284568786621, + "kl": 0.26171875, + "learning_rate": 9.02e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8125, + "epoch": 0.19613333333333333, + "grad_norm": 4.88038969039917, + "kl": 0.1953125, + "learning_rate": 9.019333333333333e-07, + "loss": 0.0078, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.6875, + "epoch": 0.19626666666666667, + "grad_norm": 27.714248657226562, + "kl": 1.046875, + "learning_rate": 9.018666666666667e-07, + "loss": 0.0419, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.1964, + "grad_norm": 9.95139217376709, + "kl": 0.28515625, + "learning_rate": 9.018e-07, + "loss": 0.0114, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0625, + "epoch": 0.19653333333333334, + "grad_norm": 0.5134475231170654, + "kl": 0.35791015625, + "learning_rate": 9.017333333333334e-07, + "loss": 0.0143, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.4375, + "epoch": 0.19666666666666666, + "grad_norm": 8.515801429748535, + "kl": 0.24853515625, + "learning_rate": 9.016666666666666e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.875, + "epoch": 0.1968, + "grad_norm": 4.614971160888672, + "kl": 0.3994140625, + "learning_rate": 9.015999999999999e-07, + "loss": 0.016, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.19693333333333332, + "grad_norm": 7.5622148513793945, + "kl": 0.2890625, + "learning_rate": 9.015333333333333e-07, + "loss": 0.0116, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.0, + "epoch": 0.19706666666666667, + "grad_norm": 7.712429046630859, + "kl": 0.365234375, + "learning_rate": 9.014666666666666e-07, + "loss": 0.0146, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.4375, + "epoch": 0.1972, + "grad_norm": 4.430418968200684, + "kl": 0.3681640625, + "learning_rate": 9.014e-07, + "loss": 0.0147, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.3125, + "epoch": 0.19733333333333333, + "grad_norm": 13.034065246582031, + "kl": 0.775390625, + "learning_rate": 9.013333333333333e-07, + "loss": 0.0311, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.19746666666666668, + "grad_norm": 10.289177894592285, + "kl": 0.2294921875, + "learning_rate": 9.012666666666667e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0625, + "epoch": 0.1976, + "grad_norm": 15.268174171447754, + "kl": 0.3408203125, + "learning_rate": 9.011999999999999e-07, + "loss": 0.0136, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 0.19773333333333334, + "grad_norm": 7.777111053466797, + "kl": 0.19921875, + "learning_rate": 9.011333333333333e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5, + "epoch": 0.19786666666666666, + "grad_norm": 10.585152626037598, + "kl": 0.2216796875, + "learning_rate": 9.010666666666666e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0, + "epoch": 0.198, + "grad_norm": 8.981874465942383, + "kl": 0.33203125, + "learning_rate": 9.01e-07, + "loss": 0.0133, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.3125, + "epoch": 0.19813333333333333, + "grad_norm": 0.6787276864051819, + "kl": 0.37109375, + "learning_rate": 9.009333333333333e-07, + "loss": 0.0148, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.5, + "epoch": 0.19826666666666667, + "grad_norm": 47.38201141357422, + "kl": 2.7275390625, + "learning_rate": 9.008666666666666e-07, + "loss": 0.1089, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.375, + "epoch": 0.1984, + "grad_norm": 10.103165626525879, + "kl": 0.53125, + "learning_rate": 9.008e-07, + "loss": 0.0212, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5625, + "epoch": 0.19853333333333334, + "grad_norm": 8.783666610717773, + "kl": 0.37890625, + "learning_rate": 9.007333333333333e-07, + "loss": 0.0152, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.125, + "epoch": 0.19866666666666666, + "grad_norm": 9.328251838684082, + "kl": 0.4375, + "learning_rate": 9.006666666666666e-07, + "loss": 0.0175, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0, + "epoch": 0.1988, + "grad_norm": 12.182361602783203, + "kl": 0.5, + "learning_rate": 9.005999999999999e-07, + "loss": 0.02, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 0.19893333333333332, + "grad_norm": 0.6004683375358582, + "kl": 0.38671875, + "learning_rate": 9.005333333333333e-07, + "loss": 0.0155, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.875, + "epoch": 0.19906666666666667, + "grad_norm": 12.147323608398438, + "kl": 0.4013671875, + "learning_rate": 9.004666666666666e-07, + "loss": 0.0161, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.0, + "epoch": 0.1992, + "grad_norm": 12.520265579223633, + "kl": 0.705078125, + "learning_rate": 9.004e-07, + "loss": 0.0281, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5625, + "epoch": 0.19933333333333333, + "grad_norm": 9.581136703491211, + "kl": 0.423828125, + "learning_rate": 9.003333333333333e-07, + "loss": 0.017, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.25, + "epoch": 0.19946666666666665, + "grad_norm": 12.076226234436035, + "kl": 0.498046875, + "learning_rate": 9.002666666666666e-07, + "loss": 0.0199, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.1996, + "grad_norm": 10.40676498413086, + "kl": 0.3447265625, + "learning_rate": 9.002e-07, + "loss": 0.0138, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.3125, + "epoch": 0.19973333333333335, + "grad_norm": 8.411348342895508, + "kl": 0.470703125, + "learning_rate": 9.001333333333333e-07, + "loss": 0.0188, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.4375, + "epoch": 0.19986666666666666, + "grad_norm": 13.905926704406738, + "kl": 0.51953125, + "learning_rate": 9.000666666666667e-07, + "loss": 0.0208, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.75, + "epoch": 0.2, + "grad_norm": 5.052263259887695, + "kl": 0.59765625, + "learning_rate": 9e-07, + "loss": 0.0239, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.125, + "epoch": 0.20013333333333333, + "grad_norm": 24.51689910888672, + "kl": 0.7265625, + "learning_rate": 8.999333333333334e-07, + "loss": 0.0291, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.375, + "epoch": 0.20026666666666668, + "grad_norm": 12.635889053344727, + "kl": 0.8193359375, + "learning_rate": 8.998666666666667e-07, + "loss": 0.0327, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.6875, + "epoch": 0.2004, + "grad_norm": 5.5517778396606445, + "kl": 0.6875, + "learning_rate": 8.998e-07, + "loss": 0.0275, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.0, + "epoch": 0.20053333333333334, + "grad_norm": 4.843735694885254, + "kl": 0.400390625, + "learning_rate": 8.997333333333333e-07, + "loss": 0.016, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.20066666666666666, + "grad_norm": 13.336478233337402, + "kl": 0.306640625, + "learning_rate": 8.996666666666665e-07, + "loss": 0.0123, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.5, + "epoch": 0.2008, + "grad_norm": 16.590229034423828, + "kl": 0.638671875, + "learning_rate": 8.995999999999999e-07, + "loss": 0.0256, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.0625, + "epoch": 0.20093333333333332, + "grad_norm": 9.620896339416504, + "kl": 0.640625, + "learning_rate": 8.995333333333332e-07, + "loss": 0.0256, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.4375, + "epoch": 0.20106666666666667, + "grad_norm": 9.131867408752441, + "kl": 0.3876953125, + "learning_rate": 8.994666666666666e-07, + "loss": 0.0155, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.2012, + "grad_norm": 17.274059295654297, + "kl": 0.4560546875, + "learning_rate": 8.993999999999999e-07, + "loss": 0.0183, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.1875, + "epoch": 0.20133333333333334, + "grad_norm": 5.922941207885742, + "kl": 0.466796875, + "learning_rate": 8.993333333333333e-07, + "loss": 0.0187, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.1875, + "epoch": 0.20146666666666666, + "grad_norm": 53.89174270629883, + "kl": 0.7587890625, + "learning_rate": 8.992666666666666e-07, + "loss": 0.0304, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 16.8125, + "epoch": 0.2016, + "grad_norm": 50.80009841918945, + "kl": 0.91015625, + "learning_rate": 8.992e-07, + "loss": 0.0364, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.875, + "epoch": 0.20173333333333332, + "grad_norm": 17.268095016479492, + "kl": 0.5078125, + "learning_rate": 8.991333333333333e-07, + "loss": 0.0204, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.4375, + "epoch": 0.20186666666666667, + "grad_norm": 16.93692970275879, + "kl": 0.83984375, + "learning_rate": 8.990666666666666e-07, + "loss": 0.0336, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 13.1875, + "epoch": 0.202, + "grad_norm": 0.8912865519523621, + "kl": 0.798828125, + "learning_rate": 8.99e-07, + "loss": 0.032, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.3125, + "epoch": 0.20213333333333333, + "grad_norm": 10.556771278381348, + "kl": 0.48046875, + "learning_rate": 8.989333333333333e-07, + "loss": 0.0193, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 17.375, + "epoch": 0.20226666666666668, + "grad_norm": 13.035178184509277, + "kl": 0.884765625, + "learning_rate": 8.988666666666667e-07, + "loss": 0.0354, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.8125, + "epoch": 0.2024, + "grad_norm": 22.997116088867188, + "kl": 0.595703125, + "learning_rate": 8.988e-07, + "loss": 0.0238, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.5, + "epoch": 0.20253333333333334, + "grad_norm": 9.2817964553833, + "kl": 0.712890625, + "learning_rate": 8.987333333333334e-07, + "loss": 0.0286, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.125, + "epoch": 0.20266666666666666, + "grad_norm": 15.973094940185547, + "kl": 0.580078125, + "learning_rate": 8.986666666666666e-07, + "loss": 0.0232, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.1875, + "epoch": 0.2028, + "grad_norm": 12.367013931274414, + "kl": 0.43896484375, + "learning_rate": 8.986e-07, + "loss": 0.0175, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.375, + "epoch": 0.20293333333333333, + "grad_norm": 11.139179229736328, + "kl": 0.4853515625, + "learning_rate": 8.985333333333333e-07, + "loss": 0.0194, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.3125, + "epoch": 0.20306666666666667, + "grad_norm": 1.1671696901321411, + "kl": 0.802734375, + "learning_rate": 8.984666666666665e-07, + "loss": 0.032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.875, + "epoch": 0.2032, + "grad_norm": 8.486967086791992, + "kl": 0.6201171875, + "learning_rate": 8.983999999999999e-07, + "loss": 0.0248, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.0625, + "epoch": 0.20333333333333334, + "grad_norm": 14.023140907287598, + "kl": 0.3720703125, + "learning_rate": 8.983333333333332e-07, + "loss": 0.0149, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.375, + "epoch": 0.20346666666666666, + "grad_norm": 2.708852529525757, + "kl": 0.5478515625, + "learning_rate": 8.982666666666666e-07, + "loss": 0.0219, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.875, + "epoch": 0.2036, + "grad_norm": 12.007070541381836, + "kl": 0.2109375, + "learning_rate": 8.981999999999999e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.8125, + "epoch": 0.20373333333333332, + "grad_norm": 15.493085861206055, + "kl": 0.3525390625, + "learning_rate": 8.981333333333333e-07, + "loss": 0.0141, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.20386666666666667, + "grad_norm": 5.75382137298584, + "kl": 0.2900390625, + "learning_rate": 8.980666666666666e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.375, + "epoch": 0.204, + "grad_norm": 10.55663776397705, + "kl": 0.3125, + "learning_rate": 8.98e-07, + "loss": 0.0125, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.75, + "epoch": 0.20413333333333333, + "grad_norm": 5.457599639892578, + "kl": 0.166015625, + "learning_rate": 8.979333333333333e-07, + "loss": 0.0066, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.875, + "epoch": 0.20426666666666668, + "grad_norm": 10.567122459411621, + "kl": 0.25390625, + "learning_rate": 8.978666666666667e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.25, + "epoch": 0.2044, + "grad_norm": 47.85872268676758, + "kl": 0.365234375, + "learning_rate": 8.978e-07, + "loss": 0.0146, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.20453333333333334, + "grad_norm": 0.3491469919681549, + "kl": 0.1328125, + "learning_rate": 8.977333333333333e-07, + "loss": 0.0053, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.9375, + "epoch": 0.20466666666666666, + "grad_norm": 11.072067260742188, + "kl": 0.26611328125, + "learning_rate": 8.976666666666666e-07, + "loss": 0.0106, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.5625, + "epoch": 0.2048, + "grad_norm": 7.856949806213379, + "kl": 0.314453125, + "learning_rate": 8.975999999999999e-07, + "loss": 0.0126, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0, + "epoch": 0.20493333333333333, + "grad_norm": 7.10308837890625, + "kl": 0.2373046875, + "learning_rate": 8.975333333333333e-07, + "loss": 0.0095, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.5, + "epoch": 0.20506666666666667, + "grad_norm": 11.953606605529785, + "kl": 1.0166015625, + "learning_rate": 8.974666666666666e-07, + "loss": 0.0406, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.625, + "epoch": 0.2052, + "grad_norm": 0.6684650182723999, + "kl": 0.267578125, + "learning_rate": 8.974e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0, + "epoch": 0.20533333333333334, + "grad_norm": 75.6776351928711, + "kl": 0.33251953125, + "learning_rate": 8.973333333333333e-07, + "loss": 0.0133, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.20546666666666666, + "grad_norm": 10.293306350708008, + "kl": 0.27099609375, + "learning_rate": 8.972666666666667e-07, + "loss": 0.0108, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.25, + "epoch": 0.2056, + "grad_norm": 0.7797024250030518, + "kl": 0.39453125, + "learning_rate": 8.972e-07, + "loss": 0.0158, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 0.20573333333333332, + "grad_norm": 3.8659732341766357, + "kl": 0.20849609375, + "learning_rate": 8.971333333333333e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.4375, + "epoch": 0.20586666666666667, + "grad_norm": 10.617049217224121, + "kl": 0.23486328125, + "learning_rate": 8.970666666666667e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.1875, + "epoch": 0.206, + "grad_norm": 10.491619110107422, + "kl": 0.251953125, + "learning_rate": 8.969999999999999e-07, + "loss": 0.0101, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 0.20613333333333334, + "grad_norm": 7.2223944664001465, + "kl": 0.21923828125, + "learning_rate": 8.969333333333333e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.875, + "epoch": 0.20626666666666665, + "grad_norm": 8.263495445251465, + "kl": 0.30078125, + "learning_rate": 8.968666666666666e-07, + "loss": 0.012, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.375, + "epoch": 0.2064, + "grad_norm": 11.304296493530273, + "kl": 0.3642578125, + "learning_rate": 8.968e-07, + "loss": 0.0145, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.1875, + "epoch": 0.20653333333333335, + "grad_norm": 8.802545547485352, + "kl": 0.2802734375, + "learning_rate": 8.967333333333333e-07, + "loss": 0.0112, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5625, + "epoch": 0.20666666666666667, + "grad_norm": 0.5357996821403503, + "kl": 0.251953125, + "learning_rate": 8.966666666666666e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.625, + "epoch": 0.2068, + "grad_norm": 10.698090553283691, + "kl": 0.2509765625, + "learning_rate": 8.965999999999999e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.20693333333333333, + "grad_norm": 0.442286878824234, + "kl": 0.19921875, + "learning_rate": 8.965333333333332e-07, + "loss": 0.008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.6875, + "epoch": 0.20706666666666668, + "grad_norm": 9.8570556640625, + "kl": 0.259765625, + "learning_rate": 8.964666666666666e-07, + "loss": 0.0104, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.2072, + "grad_norm": 6.0315680503845215, + "kl": 0.1787109375, + "learning_rate": 8.963999999999999e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 0.20733333333333334, + "grad_norm": 8.534369468688965, + "kl": 0.290771484375, + "learning_rate": 8.963333333333333e-07, + "loss": 0.0116, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9375, + "epoch": 0.20746666666666666, + "grad_norm": 7.357687473297119, + "kl": 0.15283203125, + "learning_rate": 8.962666666666666e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.2076, + "grad_norm": 5.8580098152160645, + "kl": 0.24658203125, + "learning_rate": 8.962e-07, + "loss": 0.0099, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.625, + "epoch": 0.20773333333333333, + "grad_norm": 4.757111549377441, + "kl": 0.1875, + "learning_rate": 8.961333333333333e-07, + "loss": 0.0075, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.125, + "epoch": 0.20786666666666667, + "grad_norm": 9.861364364624023, + "kl": 0.24951171875, + "learning_rate": 8.960666666666667e-07, + "loss": 0.01, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.625, + "epoch": 0.208, + "grad_norm": 0.8373271822929382, + "kl": 0.23291015625, + "learning_rate": 8.96e-07, + "loss": 0.0093, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.25, + "epoch": 0.20813333333333334, + "grad_norm": 5.670338153839111, + "kl": 0.181884765625, + "learning_rate": 8.959333333333333e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.5625, + "epoch": 0.20826666666666666, + "grad_norm": 6.389507293701172, + "kl": 0.16845703125, + "learning_rate": 8.958666666666667e-07, + "loss": 0.0068, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5625, + "epoch": 0.2084, + "grad_norm": 7.186556339263916, + "kl": 0.18603515625, + "learning_rate": 8.958e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0625, + "epoch": 0.20853333333333332, + "grad_norm": 6.719399929046631, + "kl": 0.3876953125, + "learning_rate": 8.957333333333334e-07, + "loss": 0.0155, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8125, + "epoch": 0.20866666666666667, + "grad_norm": 8.023336410522461, + "kl": 0.1640625, + "learning_rate": 8.956666666666667e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.125, + "epoch": 0.2088, + "grad_norm": 6.717441082000732, + "kl": 0.16162109375, + "learning_rate": 8.955999999999999e-07, + "loss": 0.0064, + "reward": 1.3125, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.875, + "epoch": 0.20893333333333333, + "grad_norm": 5.381060600280762, + "kl": 0.171875, + "learning_rate": 8.955333333333332e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0625, + "epoch": 0.20906666666666668, + "grad_norm": 7.066960334777832, + "kl": 0.2373046875, + "learning_rate": 8.954666666666666e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.9375, + "epoch": 0.2092, + "grad_norm": 5.566617965698242, + "kl": 0.19091796875, + "learning_rate": 8.953999999999999e-07, + "loss": 0.0076, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.375, + "epoch": 0.20933333333333334, + "grad_norm": 17.438613891601562, + "kl": 0.1787109375, + "learning_rate": 8.953333333333332e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.375, + "epoch": 0.20946666666666666, + "grad_norm": 6.943342208862305, + "kl": 0.14501953125, + "learning_rate": 8.952666666666666e-07, + "loss": 0.0058, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 0.2096, + "grad_norm": 4.91450834274292, + "kl": 0.23193359375, + "learning_rate": 8.951999999999999e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.125, + "epoch": 0.20973333333333333, + "grad_norm": 4.890664100646973, + "kl": 0.17333984375, + "learning_rate": 8.951333333333333e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.125, + "epoch": 0.20986666666666667, + "grad_norm": 4.941678524017334, + "kl": 0.20556640625, + "learning_rate": 8.950666666666666e-07, + "loss": 0.0082, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.0625, + "epoch": 0.21, + "grad_norm": 2.7066891193389893, + "kl": 0.21337890625, + "learning_rate": 8.95e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.75, + "epoch": 0.21013333333333334, + "grad_norm": 10.00333023071289, + "kl": 0.5439453125, + "learning_rate": 8.949333333333333e-07, + "loss": 0.0218, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.875, + "epoch": 0.21026666666666666, + "grad_norm": 7.789234161376953, + "kl": 0.1953125, + "learning_rate": 8.948666666666667e-07, + "loss": 0.0078, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.875, + "epoch": 0.2104, + "grad_norm": 9.984210968017578, + "kl": 0.244140625, + "learning_rate": 8.948e-07, + "loss": 0.0097, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.625, + "epoch": 0.21053333333333332, + "grad_norm": 0.40386873483657837, + "kl": 0.1640625, + "learning_rate": 8.947333333333334e-07, + "loss": 0.0066, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1875, + "epoch": 0.21066666666666667, + "grad_norm": 9.196869850158691, + "kl": 0.3017578125, + "learning_rate": 8.946666666666667e-07, + "loss": 0.0121, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.3125, + "epoch": 0.2108, + "grad_norm": 70.19774627685547, + "kl": 0.1748046875, + "learning_rate": 8.945999999999999e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5, + "epoch": 0.21093333333333333, + "grad_norm": 6.845398426055908, + "kl": 0.17578125, + "learning_rate": 8.945333333333333e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.0, + "epoch": 0.21106666666666668, + "grad_norm": 6.706015586853027, + "kl": 0.1494140625, + "learning_rate": 8.944666666666666e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 0.2112, + "grad_norm": 14.03132438659668, + "kl": 0.220703125, + "learning_rate": 8.944e-07, + "loss": 0.0088, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5, + "epoch": 0.21133333333333335, + "grad_norm": 77.47827911376953, + "kl": 0.23779296875, + "learning_rate": 8.943333333333333e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0625, + "epoch": 0.21146666666666666, + "grad_norm": 4.832369327545166, + "kl": 0.298828125, + "learning_rate": 8.942666666666667e-07, + "loss": 0.0119, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.625, + "epoch": 0.2116, + "grad_norm": 205.42701721191406, + "kl": 0.9970703125, + "learning_rate": 8.941999999999999e-07, + "loss": 0.0398, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.3125, + "epoch": 0.21173333333333333, + "grad_norm": 5.679851531982422, + "kl": 0.16015625, + "learning_rate": 8.941333333333333e-07, + "loss": 0.0064, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.375, + "epoch": 0.21186666666666668, + "grad_norm": 10.523520469665527, + "kl": 0.21435546875, + "learning_rate": 8.940666666666666e-07, + "loss": 0.0086, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.375, + "epoch": 0.212, + "grad_norm": 7.463749885559082, + "kl": 0.2744140625, + "learning_rate": 8.939999999999999e-07, + "loss": 0.011, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.21213333333333334, + "grad_norm": 5.42851448059082, + "kl": 0.185546875, + "learning_rate": 8.939333333333333e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.375, + "epoch": 0.21226666666666666, + "grad_norm": 6.53664493560791, + "kl": 0.23876953125, + "learning_rate": 8.938666666666666e-07, + "loss": 0.0096, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.625, + "epoch": 0.2124, + "grad_norm": 5.64354133605957, + "kl": 0.2275390625, + "learning_rate": 8.938e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.21253333333333332, + "grad_norm": 8.894817352294922, + "kl": 0.26513671875, + "learning_rate": 8.937333333333333e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.21266666666666667, + "grad_norm": 0.8895601630210876, + "kl": 0.2822265625, + "learning_rate": 8.936666666666667e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 0.2128, + "grad_norm": 10.527530670166016, + "kl": 0.46923828125, + "learning_rate": 8.935999999999999e-07, + "loss": 0.0188, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.21293333333333334, + "grad_norm": 0.6736980080604553, + "kl": 0.2333984375, + "learning_rate": 8.935333333333333e-07, + "loss": 0.0093, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 0.21306666666666665, + "grad_norm": 4.909970760345459, + "kl": 0.24853515625, + "learning_rate": 8.934666666666666e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.2132, + "grad_norm": 7.3703413009643555, + "kl": 0.18603515625, + "learning_rate": 8.933999999999999e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8125, + "epoch": 0.21333333333333335, + "grad_norm": 4.682952404022217, + "kl": 0.3388671875, + "learning_rate": 8.933333333333333e-07, + "loss": 0.0136, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.21346666666666667, + "grad_norm": 9.713848114013672, + "kl": 0.23291015625, + "learning_rate": 8.932666666666666e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.875, + "epoch": 0.2136, + "grad_norm": 75.22003936767578, + "kl": 0.2841796875, + "learning_rate": 8.932e-07, + "loss": 0.0113, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.21373333333333333, + "grad_norm": 0.6671133041381836, + "kl": 0.26708984375, + "learning_rate": 8.931333333333333e-07, + "loss": 0.0107, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.21386666666666668, + "grad_norm": 4.4239091873168945, + "kl": 0.2958984375, + "learning_rate": 8.930666666666667e-07, + "loss": 0.0118, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0625, + "epoch": 0.214, + "grad_norm": 7.984082221984863, + "kl": 0.20068359375, + "learning_rate": 8.93e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.21413333333333334, + "grad_norm": 0.36218371987342834, + "kl": 0.27099609375, + "learning_rate": 8.929333333333334e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.8125, + "epoch": 0.21426666666666666, + "grad_norm": 5.606235027313232, + "kl": 0.3095703125, + "learning_rate": 8.928666666666667e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 0.2144, + "grad_norm": 18.95551300048828, + "kl": 0.97265625, + "learning_rate": 8.928e-07, + "loss": 0.0389, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.21453333333333333, + "grad_norm": 0.4315226674079895, + "kl": 0.240234375, + "learning_rate": 8.927333333333333e-07, + "loss": 0.0096, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.625, + "epoch": 0.21466666666666667, + "grad_norm": 4.756107807159424, + "kl": 0.2744140625, + "learning_rate": 8.926666666666666e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.75, + "epoch": 0.2148, + "grad_norm": 7.724029064178467, + "kl": 0.19482421875, + "learning_rate": 8.925999999999999e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5, + "epoch": 0.21493333333333334, + "grad_norm": 4.708427429199219, + "kl": 0.23095703125, + "learning_rate": 8.925333333333332e-07, + "loss": 0.0092, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.0625, + "epoch": 0.21506666666666666, + "grad_norm": 0.486320823431015, + "kl": 0.23828125, + "learning_rate": 8.924666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8125, + "epoch": 0.2152, + "grad_norm": 5.596988201141357, + "kl": 0.18115234375, + "learning_rate": 8.923999999999999e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.21533333333333332, + "grad_norm": 11.723512649536133, + "kl": 0.21923828125, + "learning_rate": 8.923333333333333e-07, + "loss": 0.0088, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.875, + "epoch": 0.21546666666666667, + "grad_norm": 11.411650657653809, + "kl": 0.306640625, + "learning_rate": 8.922666666666666e-07, + "loss": 0.0123, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.2156, + "grad_norm": 6.878583908081055, + "kl": 0.23583984375, + "learning_rate": 8.921999999999999e-07, + "loss": 0.0094, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 0.21573333333333333, + "grad_norm": 0.8746423125267029, + "kl": 0.21923828125, + "learning_rate": 8.921333333333333e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.125, + "epoch": 0.21586666666666668, + "grad_norm": 5.97312068939209, + "kl": 0.19970703125, + "learning_rate": 8.920666666666666e-07, + "loss": 0.008, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0625, + "epoch": 0.216, + "grad_norm": 11.551717758178711, + "kl": 0.337890625, + "learning_rate": 8.92e-07, + "loss": 0.0135, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.21613333333333334, + "grad_norm": 5.197961807250977, + "kl": 0.2001953125, + "learning_rate": 8.919333333333333e-07, + "loss": 0.008, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.21626666666666666, + "grad_norm": 8.01359748840332, + "kl": 0.31640625, + "learning_rate": 8.918666666666667e-07, + "loss": 0.0127, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.9375, + "epoch": 0.2164, + "grad_norm": 5.784873962402344, + "kl": 0.29931640625, + "learning_rate": 8.918e-07, + "loss": 0.012, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.6875, + "epoch": 0.21653333333333333, + "grad_norm": 0.40078210830688477, + "kl": 0.208984375, + "learning_rate": 8.917333333333334e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.3125, + "epoch": 0.21666666666666667, + "grad_norm": 0.4244561493396759, + "kl": 0.23388671875, + "learning_rate": 8.916666666666667e-07, + "loss": 0.0094, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.1875, + "epoch": 0.2168, + "grad_norm": 18.34852409362793, + "kl": 0.24267578125, + "learning_rate": 8.915999999999999e-07, + "loss": 0.0097, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5625, + "epoch": 0.21693333333333334, + "grad_norm": 0.4267442226409912, + "kl": 0.2548828125, + "learning_rate": 8.915333333333333e-07, + "loss": 0.0102, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.21706666666666666, + "grad_norm": 6.400885105133057, + "kl": 0.2275390625, + "learning_rate": 8.914666666666665e-07, + "loss": 0.0091, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.6875, + "epoch": 0.2172, + "grad_norm": 16.950599670410156, + "kl": 0.26123046875, + "learning_rate": 8.914e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5625, + "epoch": 0.21733333333333332, + "grad_norm": 10.568431854248047, + "kl": 0.255859375, + "learning_rate": 8.913333333333332e-07, + "loss": 0.0102, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.9375, + "epoch": 0.21746666666666667, + "grad_norm": 10.82353401184082, + "kl": 0.5078125, + "learning_rate": 8.912666666666666e-07, + "loss": 0.0203, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.2176, + "grad_norm": 5.381161212921143, + "kl": 0.21044921875, + "learning_rate": 8.911999999999999e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.21773333333333333, + "grad_norm": 11.724624633789062, + "kl": 0.24267578125, + "learning_rate": 8.911333333333333e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.6875, + "epoch": 0.21786666666666665, + "grad_norm": 0.48514366149902344, + "kl": 0.2978515625, + "learning_rate": 8.910666666666666e-07, + "loss": 0.0119, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.218, + "grad_norm": 9.121038436889648, + "kl": 0.33203125, + "learning_rate": 8.91e-07, + "loss": 0.0133, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.8125, + "epoch": 0.21813333333333335, + "grad_norm": 7.295797348022461, + "kl": 0.2939453125, + "learning_rate": 8.909333333333333e-07, + "loss": 0.0117, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.0625, + "epoch": 0.21826666666666666, + "grad_norm": 6.911016464233398, + "kl": 0.2890625, + "learning_rate": 8.908666666666666e-07, + "loss": 0.0116, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.2184, + "grad_norm": 4.118130207061768, + "kl": 0.23681640625, + "learning_rate": 8.908e-07, + "loss": 0.0095, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.625, + "epoch": 0.21853333333333333, + "grad_norm": 6.980266094207764, + "kl": 0.17041015625, + "learning_rate": 8.907333333333333e-07, + "loss": 0.0068, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.125, + "epoch": 0.21866666666666668, + "grad_norm": 14.340261459350586, + "kl": 0.5126953125, + "learning_rate": 8.906666666666667e-07, + "loss": 0.0205, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.875, + "epoch": 0.2188, + "grad_norm": 8.923785209655762, + "kl": 0.2470703125, + "learning_rate": 8.905999999999999e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.4375, + "epoch": 0.21893333333333334, + "grad_norm": 5.453479290008545, + "kl": 0.267578125, + "learning_rate": 8.905333333333333e-07, + "loss": 0.0107, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5, + "epoch": 0.21906666666666666, + "grad_norm": 7.779721736907959, + "kl": 0.22900390625, + "learning_rate": 8.904666666666666e-07, + "loss": 0.0092, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.75, + "epoch": 0.2192, + "grad_norm": 11.352944374084473, + "kl": 0.421875, + "learning_rate": 8.904e-07, + "loss": 0.0169, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.8125, + "epoch": 0.21933333333333332, + "grad_norm": 9.817278861999512, + "kl": 0.302734375, + "learning_rate": 8.903333333333333e-07, + "loss": 0.0121, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.8125, + "epoch": 0.21946666666666667, + "grad_norm": 10.564791679382324, + "kl": 0.41015625, + "learning_rate": 8.902666666666666e-07, + "loss": 0.0164, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.875, + "epoch": 0.2196, + "grad_norm": 5.671942710876465, + "kl": 0.1591796875, + "learning_rate": 8.902e-07, + "loss": 0.0064, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0, + "epoch": 0.21973333333333334, + "grad_norm": 45.61763000488281, + "kl": 0.185546875, + "learning_rate": 8.901333333333333e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.6875, + "epoch": 0.21986666666666665, + "grad_norm": 9.00725269317627, + "kl": 0.658203125, + "learning_rate": 8.900666666666667e-07, + "loss": 0.0263, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.4375, + "epoch": 0.22, + "grad_norm": 0.5108583569526672, + "kl": 0.23876953125, + "learning_rate": 8.9e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.375, + "epoch": 0.22013333333333332, + "grad_norm": 9.265876770019531, + "kl": 0.23486328125, + "learning_rate": 8.899333333333333e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.375, + "epoch": 0.22026666666666667, + "grad_norm": 0.784351646900177, + "kl": 0.4482421875, + "learning_rate": 8.898666666666666e-07, + "loss": 0.0179, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.0625, + "epoch": 0.2204, + "grad_norm": 8.936046600341797, + "kl": 0.2666015625, + "learning_rate": 8.898e-07, + "loss": 0.0106, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 0.22053333333333333, + "grad_norm": 9.943343162536621, + "kl": 0.20751953125, + "learning_rate": 8.897333333333333e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.125, + "epoch": 0.22066666666666668, + "grad_norm": 7.3268208503723145, + "kl": 0.5078125, + "learning_rate": 8.896666666666666e-07, + "loss": 0.0203, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.0, + "epoch": 0.2208, + "grad_norm": 0.4864865839481354, + "kl": 0.21875, + "learning_rate": 8.895999999999999e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.9375, + "epoch": 0.22093333333333334, + "grad_norm": 6.313357830047607, + "kl": 0.35009765625, + "learning_rate": 8.895333333333332e-07, + "loss": 0.014, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.4375, + "epoch": 0.22106666666666666, + "grad_norm": 0.7385640740394592, + "kl": 0.3125, + "learning_rate": 8.894666666666666e-07, + "loss": 0.0125, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.1875, + "epoch": 0.2212, + "grad_norm": 0.4527451694011688, + "kl": 0.2080078125, + "learning_rate": 8.893999999999999e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.6875, + "epoch": 0.22133333333333333, + "grad_norm": 12.6087007522583, + "kl": 0.3388671875, + "learning_rate": 8.893333333333333e-07, + "loss": 0.0136, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.22146666666666667, + "grad_norm": 0.29496222734451294, + "kl": 0.19482421875, + "learning_rate": 8.892666666666666e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 0.2216, + "grad_norm": 0.7083157896995544, + "kl": 0.1904296875, + "learning_rate": 8.892e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.75, + "epoch": 0.22173333333333334, + "grad_norm": 10.594388961791992, + "kl": 0.43212890625, + "learning_rate": 8.891333333333333e-07, + "loss": 0.0173, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.25, + "epoch": 0.22186666666666666, + "grad_norm": 9.688993453979492, + "kl": 0.22314453125, + "learning_rate": 8.890666666666666e-07, + "loss": 0.0089, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0, + "epoch": 0.222, + "grad_norm": 4.988584041595459, + "kl": 0.19970703125, + "learning_rate": 8.89e-07, + "loss": 0.008, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.875, + "epoch": 0.22213333333333332, + "grad_norm": 0.4835761785507202, + "kl": 0.271484375, + "learning_rate": 8.889333333333333e-07, + "loss": 0.0108, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.4375, + "epoch": 0.22226666666666667, + "grad_norm": 21.007293701171875, + "kl": 0.4482421875, + "learning_rate": 8.888666666666667e-07, + "loss": 0.0179, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.2224, + "grad_norm": 17.271081924438477, + "kl": 0.3759765625, + "learning_rate": 8.888e-07, + "loss": 0.0151, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.22253333333333333, + "grad_norm": 0.40936383605003357, + "kl": 0.154052734375, + "learning_rate": 8.887333333333334e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.125, + "epoch": 0.22266666666666668, + "grad_norm": 92.00157928466797, + "kl": 0.341796875, + "learning_rate": 8.886666666666667e-07, + "loss": 0.0137, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6875, + "epoch": 0.2228, + "grad_norm": 8.170095443725586, + "kl": 0.1767578125, + "learning_rate": 8.886e-07, + "loss": 0.0071, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.75, + "epoch": 0.22293333333333334, + "grad_norm": 8.213933944702148, + "kl": 0.3076171875, + "learning_rate": 8.885333333333332e-07, + "loss": 0.0123, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5, + "epoch": 0.22306666666666666, + "grad_norm": 11.107932090759277, + "kl": 0.1767578125, + "learning_rate": 8.884666666666665e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.25, + "epoch": 0.2232, + "grad_norm": 5.779315948486328, + "kl": 0.21337890625, + "learning_rate": 8.883999999999999e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.875, + "epoch": 0.22333333333333333, + "grad_norm": 5.508561134338379, + "kl": 0.21240234375, + "learning_rate": 8.883333333333332e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.375, + "epoch": 0.22346666666666667, + "grad_norm": 6.723508834838867, + "kl": 0.19482421875, + "learning_rate": 8.882666666666666e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.125, + "epoch": 0.2236, + "grad_norm": 0.5295045971870422, + "kl": 0.267578125, + "learning_rate": 8.881999999999999e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.375, + "epoch": 0.22373333333333334, + "grad_norm": 7.895566463470459, + "kl": 0.26123046875, + "learning_rate": 8.881333333333333e-07, + "loss": 0.0105, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 0.22386666666666666, + "grad_norm": 6.769871234893799, + "kl": 0.142822265625, + "learning_rate": 8.880666666666666e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.4375, + "epoch": 0.224, + "grad_norm": 0.5422409176826477, + "kl": 0.3603515625, + "learning_rate": 8.88e-07, + "loss": 0.0144, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1875, + "epoch": 0.22413333333333332, + "grad_norm": 3.443497896194458, + "kl": 0.15576171875, + "learning_rate": 8.879333333333333e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.22426666666666667, + "grad_norm": 5.121555328369141, + "kl": 0.1728515625, + "learning_rate": 8.878666666666667e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.9375, + "epoch": 0.2244, + "grad_norm": 34.024993896484375, + "kl": 0.3017578125, + "learning_rate": 8.878e-07, + "loss": 0.0121, + "reward": 1.6875, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.8125, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 0.22453333333333333, + "grad_norm": 13.336943626403809, + "kl": 0.189453125, + "learning_rate": 8.877333333333333e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4375, + "epoch": 0.22466666666666665, + "grad_norm": 46.8646354675293, + "kl": 0.541015625, + "learning_rate": 8.876666666666667e-07, + "loss": 0.0216, + "reward": 1.4375, + "reward_std": 0.7269706428050995, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.625, + "epoch": 0.2248, + "grad_norm": 5.1765875816345215, + "kl": 0.24462890625, + "learning_rate": 8.875999999999999e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.22493333333333335, + "grad_norm": 5.480419635772705, + "kl": 0.23291015625, + "learning_rate": 8.875333333333333e-07, + "loss": 0.0093, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.22506666666666666, + "grad_norm": 4.309286594390869, + "kl": 0.175048828125, + "learning_rate": 8.874666666666666e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1875, + "epoch": 0.2252, + "grad_norm": 4.092801094055176, + "kl": 0.205078125, + "learning_rate": 8.874e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 0.22533333333333333, + "grad_norm": 6.667372703552246, + "kl": 0.21337890625, + "learning_rate": 8.873333333333333e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.22546666666666668, + "grad_norm": 8.328632354736328, + "kl": 0.21142578125, + "learning_rate": 8.872666666666667e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.9375, + "epoch": 0.2256, + "grad_norm": 10.256142616271973, + "kl": 0.45654296875, + "learning_rate": 8.872e-07, + "loss": 0.0182, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.375, + "epoch": 0.22573333333333334, + "grad_norm": 5.712655067443848, + "kl": 0.220703125, + "learning_rate": 8.871333333333332e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5625, + "epoch": 0.22586666666666666, + "grad_norm": 5.2334370613098145, + "kl": 0.1748046875, + "learning_rate": 8.870666666666666e-07, + "loss": 0.007, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8125, + "epoch": 0.226, + "grad_norm": 8.42894458770752, + "kl": 0.1962890625, + "learning_rate": 8.869999999999999e-07, + "loss": 0.0078, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.3125, + "epoch": 0.22613333333333333, + "grad_norm": 9.279065132141113, + "kl": 0.17626953125, + "learning_rate": 8.869333333333333e-07, + "loss": 0.007, + "reward": 1.0, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8125, + "epoch": 0.22626666666666667, + "grad_norm": 11.558822631835938, + "kl": 0.2373046875, + "learning_rate": 8.868666666666666e-07, + "loss": 0.0095, + "reward": 0.9375, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.75, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.6875, + "epoch": 0.2264, + "grad_norm": 7.309298038482666, + "kl": 0.1787109375, + "learning_rate": 8.868e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.22653333333333334, + "grad_norm": 8.484354972839355, + "kl": 0.27783203125, + "learning_rate": 8.867333333333333e-07, + "loss": 0.0111, + "reward": 1.3125, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5625, + "epoch": 0.22666666666666666, + "grad_norm": 8.585436820983887, + "kl": 0.3017578125, + "learning_rate": 8.866666666666667e-07, + "loss": 0.0121, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0625, + "epoch": 0.2268, + "grad_norm": 5.693006992340088, + "kl": 0.1455078125, + "learning_rate": 8.866e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 0.22693333333333332, + "grad_norm": 7.0129852294921875, + "kl": 0.255859375, + "learning_rate": 8.865333333333332e-07, + "loss": 0.0103, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0, + "epoch": 0.22706666666666667, + "grad_norm": 10.39387035369873, + "kl": 0.201171875, + "learning_rate": 8.864666666666666e-07, + "loss": 0.0081, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3125, + "epoch": 0.2272, + "grad_norm": 11.072248458862305, + "kl": 0.18896484375, + "learning_rate": 8.863999999999999e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.9375, + "epoch": 0.22733333333333333, + "grad_norm": 6.059688568115234, + "kl": 0.408203125, + "learning_rate": 8.863333333333333e-07, + "loss": 0.0163, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0, + "epoch": 0.22746666666666668, + "grad_norm": 42.517295837402344, + "kl": 2.38134765625, + "learning_rate": 8.862666666666666e-07, + "loss": 0.0959, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.9375, + "epoch": 0.2276, + "grad_norm": 6.203853607177734, + "kl": 0.18603515625, + "learning_rate": 8.862e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5625, + "epoch": 0.22773333333333334, + "grad_norm": 6.547203540802002, + "kl": 0.2041015625, + "learning_rate": 8.861333333333333e-07, + "loss": 0.0082, + "reward": 1.125, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8125, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.125, + "epoch": 0.22786666666666666, + "grad_norm": 12.442816734313965, + "kl": 0.40234375, + "learning_rate": 8.860666666666667e-07, + "loss": 0.0161, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1875, + "epoch": 0.228, + "grad_norm": 7.237114906311035, + "kl": 0.19482421875, + "learning_rate": 8.86e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.22813333333333333, + "grad_norm": 9.811724662780762, + "kl": 0.24853515625, + "learning_rate": 8.859333333333333e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.625, + "epoch": 0.22826666666666667, + "grad_norm": 8.10726261138916, + "kl": 0.28515625, + "learning_rate": 8.858666666666667e-07, + "loss": 0.0114, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0, + "epoch": 0.2284, + "grad_norm": 15.848331451416016, + "kl": 0.3408203125, + "learning_rate": 8.858e-07, + "loss": 0.0136, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.4375, + "epoch": 0.22853333333333334, + "grad_norm": 30.318613052368164, + "kl": 0.2607421875, + "learning_rate": 8.857333333333334e-07, + "loss": 0.0104, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.625, + "epoch": 0.22866666666666666, + "grad_norm": 5.214493751525879, + "kl": 0.18212890625, + "learning_rate": 8.856666666666666e-07, + "loss": 0.0073, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.2288, + "grad_norm": 7.6611647605896, + "kl": 0.35791015625, + "learning_rate": 8.856e-07, + "loss": 0.0144, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9375, + "epoch": 0.22893333333333332, + "grad_norm": 6.201143741607666, + "kl": 0.14892578125, + "learning_rate": 8.855333333333332e-07, + "loss": 0.006, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5625, + "epoch": 0.22906666666666667, + "grad_norm": 6.984725475311279, + "kl": 0.3017578125, + "learning_rate": 8.854666666666666e-07, + "loss": 0.0121, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.2292, + "grad_norm": 9.064791679382324, + "kl": 0.2626953125, + "learning_rate": 8.853999999999999e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.4375, + "epoch": 0.22933333333333333, + "grad_norm": 8.3569974899292, + "kl": 0.1884765625, + "learning_rate": 8.853333333333332e-07, + "loss": 0.0075, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 0.22946666666666668, + "grad_norm": 10.342194557189941, + "kl": 0.18505859375, + "learning_rate": 8.852666666666666e-07, + "loss": 0.0074, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.2296, + "grad_norm": 32.3656005859375, + "kl": 0.192138671875, + "learning_rate": 8.851999999999999e-07, + "loss": 0.0077, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 0.22973333333333334, + "grad_norm": 7.275010108947754, + "kl": 0.18359375, + "learning_rate": 8.851333333333333e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.22986666666666666, + "grad_norm": 6.547823429107666, + "kl": 0.2255859375, + "learning_rate": 8.850666666666666e-07, + "loss": 0.009, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.375, + "epoch": 0.23, + "grad_norm": 24.129728317260742, + "kl": 1.142578125, + "learning_rate": 8.85e-07, + "loss": 0.0458, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5, + "epoch": 0.23013333333333333, + "grad_norm": 6.957425117492676, + "kl": 0.291015625, + "learning_rate": 8.849333333333333e-07, + "loss": 0.0116, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.125, + "epoch": 0.23026666666666668, + "grad_norm": 14.05274772644043, + "kl": 0.65869140625, + "learning_rate": 8.848666666666667e-07, + "loss": 0.0265, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.4375, + "epoch": 0.2304, + "grad_norm": 7.658838272094727, + "kl": 0.171875, + "learning_rate": 8.848e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.125, + "epoch": 0.23053333333333334, + "grad_norm": 4.321747303009033, + "kl": 0.13720703125, + "learning_rate": 8.847333333333334e-07, + "loss": 0.0055, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.23066666666666666, + "grad_norm": 30.740955352783203, + "kl": 0.51611328125, + "learning_rate": 8.846666666666667e-07, + "loss": 0.0207, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0625, + "epoch": 0.2308, + "grad_norm": 6.1208038330078125, + "kl": 0.1884765625, + "learning_rate": 8.846e-07, + "loss": 0.0075, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.6875, + "epoch": 0.23093333333333332, + "grad_norm": 12.150681495666504, + "kl": 0.22705078125, + "learning_rate": 8.845333333333333e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.3125, + "epoch": 0.23106666666666667, + "grad_norm": 6.358458518981934, + "kl": 0.15283203125, + "learning_rate": 8.844666666666666e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.875, + "epoch": 0.2312, + "grad_norm": 5.0455498695373535, + "kl": 0.149658203125, + "learning_rate": 8.844e-07, + "loss": 0.006, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.875, + "epoch": 0.23133333333333334, + "grad_norm": 6.2399468421936035, + "kl": 0.16845703125, + "learning_rate": 8.843333333333332e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.875, + "epoch": 0.23146666666666665, + "grad_norm": 9.772724151611328, + "kl": 0.306640625, + "learning_rate": 8.842666666666666e-07, + "loss": 0.0123, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.2316, + "grad_norm": 7.923486232757568, + "kl": 0.17919921875, + "learning_rate": 8.841999999999999e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9375, + "epoch": 0.23173333333333335, + "grad_norm": 3.6558775901794434, + "kl": 0.174072265625, + "learning_rate": 8.841333333333333e-07, + "loss": 0.007, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 0.23186666666666667, + "grad_norm": 0.5622139573097229, + "kl": 0.18701171875, + "learning_rate": 8.840666666666666e-07, + "loss": 0.0075, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.3125, + "epoch": 0.232, + "grad_norm": 17.108671188354492, + "kl": 0.16552734375, + "learning_rate": 8.839999999999999e-07, + "loss": 0.0066, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.875, + "epoch": 0.23213333333333333, + "grad_norm": 9.137226104736328, + "kl": 0.18310546875, + "learning_rate": 8.839333333333333e-07, + "loss": 0.0073, + "reward": 1.1875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6875, + "epoch": 0.23226666666666668, + "grad_norm": 4.082355976104736, + "kl": 0.157958984375, + "learning_rate": 8.838666666666666e-07, + "loss": 0.0063, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.2324, + "grad_norm": 6.57108736038208, + "kl": 0.19873046875, + "learning_rate": 8.838e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0, + "epoch": 0.23253333333333334, + "grad_norm": 6.203197956085205, + "kl": 0.1298828125, + "learning_rate": 8.837333333333333e-07, + "loss": 0.0052, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1875, + "epoch": 0.23266666666666666, + "grad_norm": 9.401361465454102, + "kl": 0.162109375, + "learning_rate": 8.836666666666667e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 0.2328, + "grad_norm": 5.245575428009033, + "kl": 0.15283203125, + "learning_rate": 8.836e-07, + "loss": 0.0061, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 0.23293333333333333, + "grad_norm": 0.3458852171897888, + "kl": 0.13916015625, + "learning_rate": 8.835333333333333e-07, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.23306666666666667, + "grad_norm": 5.491477012634277, + "kl": 0.22802734375, + "learning_rate": 8.834666666666666e-07, + "loss": 0.0091, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.2332, + "grad_norm": 7.104381561279297, + "kl": 0.1953125, + "learning_rate": 8.833999999999999e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.6875, + "epoch": 0.23333333333333334, + "grad_norm": 8.012308120727539, + "kl": 0.17236328125, + "learning_rate": 8.833333333333333e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.23346666666666666, + "grad_norm": 5.675718307495117, + "kl": 0.247314453125, + "learning_rate": 8.832666666666666e-07, + "loss": 0.0099, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.375, + "epoch": 0.2336, + "grad_norm": 5.472855091094971, + "kl": 0.17822265625, + "learning_rate": 8.832e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.6875, + "epoch": 0.23373333333333332, + "grad_norm": 5.2444305419921875, + "kl": 0.1591796875, + "learning_rate": 8.831333333333333e-07, + "loss": 0.0064, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.8125, + "epoch": 0.23386666666666667, + "grad_norm": 6.057385444641113, + "kl": 0.2548828125, + "learning_rate": 8.830666666666667e-07, + "loss": 0.0102, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.234, + "grad_norm": 9.363860130310059, + "kl": 0.1845703125, + "learning_rate": 8.83e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.23413333333333333, + "grad_norm": 4.6095991134643555, + "kl": 0.224609375, + "learning_rate": 8.829333333333334e-07, + "loss": 0.009, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.23426666666666668, + "grad_norm": 3.9308722019195557, + "kl": 0.1796875, + "learning_rate": 8.828666666666666e-07, + "loss": 0.0072, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3125, + "epoch": 0.2344, + "grad_norm": 8.726221084594727, + "kl": 0.1455078125, + "learning_rate": 8.827999999999999e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6875, + "epoch": 0.23453333333333334, + "grad_norm": 4.215245246887207, + "kl": 0.15966796875, + "learning_rate": 8.827333333333333e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.125, + "epoch": 0.23466666666666666, + "grad_norm": 9.142005920410156, + "kl": 0.18896484375, + "learning_rate": 8.826666666666666e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.9375, + "epoch": 0.2348, + "grad_norm": 7.327962875366211, + "kl": 0.25830078125, + "learning_rate": 8.826e-07, + "loss": 0.0103, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.23493333333333333, + "grad_norm": 8.874978065490723, + "kl": 0.19189453125, + "learning_rate": 8.825333333333332e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.23506666666666667, + "grad_norm": 7.6941046714782715, + "kl": 0.13525390625, + "learning_rate": 8.824666666666666e-07, + "loss": 0.0054, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.8125, + "epoch": 0.2352, + "grad_norm": 5.9755940437316895, + "kl": 0.15869140625, + "learning_rate": 8.823999999999999e-07, + "loss": 0.0064, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 0.23533333333333334, + "grad_norm": 17.182861328125, + "kl": 0.26025390625, + "learning_rate": 8.823333333333333e-07, + "loss": 0.0104, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.1875, + "epoch": 0.23546666666666666, + "grad_norm": 7.039700508117676, + "kl": 0.173583984375, + "learning_rate": 8.822666666666666e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.375, + "epoch": 0.2356, + "grad_norm": 7.607717990875244, + "kl": 0.205078125, + "learning_rate": 8.821999999999999e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.625, + "epoch": 0.23573333333333332, + "grad_norm": 11.240504264831543, + "kl": 0.21435546875, + "learning_rate": 8.821333333333333e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.23586666666666667, + "grad_norm": 4.199526786804199, + "kl": 0.15234375, + "learning_rate": 8.820666666666666e-07, + "loss": 0.0061, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.236, + "grad_norm": 5.099433422088623, + "kl": 0.19873046875, + "learning_rate": 8.82e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.9375, + "epoch": 0.23613333333333333, + "grad_norm": 11.522933006286621, + "kl": 0.16259765625, + "learning_rate": 8.819333333333333e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.9375, + "epoch": 0.23626666666666668, + "grad_norm": 7.172472953796387, + "kl": 0.1865234375, + "learning_rate": 8.818666666666667e-07, + "loss": 0.0075, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.2364, + "grad_norm": 0.5922273397445679, + "kl": 0.2314453125, + "learning_rate": 8.818e-07, + "loss": 0.0093, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.23653333333333335, + "grad_norm": 7.386436462402344, + "kl": 0.18212890625, + "learning_rate": 8.817333333333334e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.75, + "epoch": 0.23666666666666666, + "grad_norm": 12.589827537536621, + "kl": 0.52978515625, + "learning_rate": 8.816666666666667e-07, + "loss": 0.0212, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.1875, + "epoch": 0.2368, + "grad_norm": 7.028797149658203, + "kl": 0.2646484375, + "learning_rate": 8.816000000000001e-07, + "loss": 0.0106, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.1875, + "epoch": 0.23693333333333333, + "grad_norm": 5.6260809898376465, + "kl": 0.17626953125, + "learning_rate": 8.815333333333332e-07, + "loss": 0.007, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.0, + "epoch": 0.23706666666666668, + "grad_norm": 6.841703414916992, + "kl": 0.23974609375, + "learning_rate": 8.814666666666665e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.75, + "epoch": 0.2372, + "grad_norm": 4.597301006317139, + "kl": 0.16943359375, + "learning_rate": 8.813999999999999e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.75, + "epoch": 0.23733333333333334, + "grad_norm": 4.774216651916504, + "kl": 0.177734375, + "learning_rate": 8.813333333333332e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.625, + "epoch": 0.23746666666666666, + "grad_norm": 7.507312774658203, + "kl": 0.27734375, + "learning_rate": 8.812666666666666e-07, + "loss": 0.0111, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.3125, + "epoch": 0.2376, + "grad_norm": 0.39815181493759155, + "kl": 0.185546875, + "learning_rate": 8.811999999999999e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.375, + "epoch": 0.23773333333333332, + "grad_norm": 9.361734390258789, + "kl": 0.28857421875, + "learning_rate": 8.811333333333333e-07, + "loss": 0.0115, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5, + "epoch": 0.23786666666666667, + "grad_norm": 7.433305263519287, + "kl": 0.265625, + "learning_rate": 8.810666666666666e-07, + "loss": 0.0106, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.9375, + "epoch": 0.238, + "grad_norm": 5.791871547698975, + "kl": 0.15576171875, + "learning_rate": 8.81e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5625, + "epoch": 0.23813333333333334, + "grad_norm": 4.924501895904541, + "kl": 0.22607421875, + "learning_rate": 8.809333333333333e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.125, + "epoch": 0.23826666666666665, + "grad_norm": 19.793237686157227, + "kl": 0.2587890625, + "learning_rate": 8.808666666666666e-07, + "loss": 0.0104, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.375, + "epoch": 0.2384, + "grad_norm": 0.5818777680397034, + "kl": 0.25732421875, + "learning_rate": 8.808e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5625, + "epoch": 0.23853333333333335, + "grad_norm": 5.30240535736084, + "kl": 0.2158203125, + "learning_rate": 8.807333333333333e-07, + "loss": 0.0086, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.25, + "epoch": 0.23866666666666667, + "grad_norm": 9.288991928100586, + "kl": 0.2177734375, + "learning_rate": 8.806666666666667e-07, + "loss": 0.0087, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.75, + "epoch": 0.2388, + "grad_norm": 0.6431390643119812, + "kl": 0.18310546875, + "learning_rate": 8.806e-07, + "loss": 0.0073, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.25, + "epoch": 0.23893333333333333, + "grad_norm": 5.073430061340332, + "kl": 0.2021484375, + "learning_rate": 8.805333333333333e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.25, + "epoch": 0.23906666666666668, + "grad_norm": 8.628493309020996, + "kl": 0.28125, + "learning_rate": 8.804666666666666e-07, + "loss": 0.0112, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.2392, + "grad_norm": 10.018132209777832, + "kl": 0.27294921875, + "learning_rate": 8.804e-07, + "loss": 0.0109, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.375, + "epoch": 0.23933333333333334, + "grad_norm": 9.550463676452637, + "kl": 0.443359375, + "learning_rate": 8.803333333333333e-07, + "loss": 0.0178, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.125, + "epoch": 0.23946666666666666, + "grad_norm": 10.800012588500977, + "kl": 0.294921875, + "learning_rate": 8.802666666666666e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0625, + "epoch": 0.2396, + "grad_norm": 14.609504699707031, + "kl": 0.2568359375, + "learning_rate": 8.802e-07, + "loss": 0.0103, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.75, + "epoch": 0.23973333333333333, + "grad_norm": 9.401741981506348, + "kl": 0.26611328125, + "learning_rate": 8.801333333333332e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.875, + "epoch": 0.23986666666666667, + "grad_norm": 10.959148406982422, + "kl": 0.3505859375, + "learning_rate": 8.800666666666666e-07, + "loss": 0.014, + "reward": 1.375, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.875, + "epoch": 0.24, + "grad_norm": 0.9541487693786621, + "kl": 0.30126953125, + "learning_rate": 8.799999999999999e-07, + "loss": 0.012, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.24013333333333334, + "grad_norm": 9.064167022705078, + "kl": 0.2138671875, + "learning_rate": 8.799333333333333e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.6875, + "epoch": 0.24026666666666666, + "grad_norm": 10.76862621307373, + "kl": 0.31640625, + "learning_rate": 8.798666666666666e-07, + "loss": 0.0127, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.75, + "epoch": 0.2404, + "grad_norm": 0.8384894728660583, + "kl": 0.48828125, + "learning_rate": 8.798e-07, + "loss": 0.0195, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.875, + "epoch": 0.24053333333333332, + "grad_norm": 15.967658042907715, + "kl": 0.3623046875, + "learning_rate": 8.797333333333333e-07, + "loss": 0.0145, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0, + "epoch": 0.24066666666666667, + "grad_norm": 4.836309432983398, + "kl": 0.23095703125, + "learning_rate": 8.796666666666666e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.6875, + "epoch": 0.2408, + "grad_norm": 13.17647647857666, + "kl": 0.2939453125, + "learning_rate": 8.796e-07, + "loss": 0.0118, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.9375, + "epoch": 0.24093333333333333, + "grad_norm": 8.685728073120117, + "kl": 0.5849609375, + "learning_rate": 8.795333333333332e-07, + "loss": 0.0234, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.4375, + "epoch": 0.24106666666666668, + "grad_norm": 14.383845329284668, + "kl": 0.537109375, + "learning_rate": 8.794666666666666e-07, + "loss": 0.0215, + "reward": 1.1875, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.25, + "epoch": 0.2412, + "grad_norm": 10.097926139831543, + "kl": 0.255859375, + "learning_rate": 8.793999999999999e-07, + "loss": 0.0102, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.625, + "epoch": 0.24133333333333334, + "grad_norm": 15.82018756866455, + "kl": 0.412109375, + "learning_rate": 8.793333333333333e-07, + "loss": 0.0165, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.875, + "epoch": 0.24146666666666666, + "grad_norm": 28.93244743347168, + "kl": 0.322265625, + "learning_rate": 8.792666666666666e-07, + "loss": 0.0129, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.25, + "epoch": 0.2416, + "grad_norm": 9.092199325561523, + "kl": 0.21533203125, + "learning_rate": 8.792e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.8125, + "epoch": 0.24173333333333333, + "grad_norm": 10.019381523132324, + "kl": 0.2333984375, + "learning_rate": 8.791333333333333e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.9375, + "epoch": 0.24186666666666667, + "grad_norm": 8.20236873626709, + "kl": 0.28125, + "learning_rate": 8.790666666666666e-07, + "loss": 0.0112, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.125, + "epoch": 0.242, + "grad_norm": 16.326242446899414, + "kl": 0.3974609375, + "learning_rate": 8.79e-07, + "loss": 0.0159, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.9375, + "epoch": 0.24213333333333334, + "grad_norm": 14.381290435791016, + "kl": 0.3173828125, + "learning_rate": 8.789333333333333e-07, + "loss": 0.0127, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.375, + "epoch": 0.24226666666666666, + "grad_norm": 5.3151469230651855, + "kl": 0.341796875, + "learning_rate": 8.788666666666667e-07, + "loss": 0.0136, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.4375, + "epoch": 0.2424, + "grad_norm": 5.843325138092041, + "kl": 0.3662109375, + "learning_rate": 8.788e-07, + "loss": 0.0147, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.375, + "epoch": 0.24253333333333332, + "grad_norm": 9.725837707519531, + "kl": 0.34375, + "learning_rate": 8.787333333333334e-07, + "loss": 0.0138, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.3125, + "epoch": 0.24266666666666667, + "grad_norm": 5.253890514373779, + "kl": 0.2822265625, + "learning_rate": 8.786666666666666e-07, + "loss": 0.0113, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.75, + "epoch": 0.2428, + "grad_norm": 12.208366394042969, + "kl": 0.4130859375, + "learning_rate": 8.786e-07, + "loss": 0.0165, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.0625, + "epoch": 0.24293333333333333, + "grad_norm": 21.907949447631836, + "kl": 0.3642578125, + "learning_rate": 8.785333333333333e-07, + "loss": 0.0146, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.6875, + "epoch": 0.24306666666666665, + "grad_norm": 0.6521739363670349, + "kl": 0.326171875, + "learning_rate": 8.784666666666665e-07, + "loss": 0.013, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25, + "epoch": 0.2432, + "grad_norm": 10.113819122314453, + "kl": 0.25634765625, + "learning_rate": 8.783999999999999e-07, + "loss": 0.0103, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.375, + "epoch": 0.24333333333333335, + "grad_norm": 9.22133731842041, + "kl": 0.24267578125, + "learning_rate": 8.783333333333332e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.1875, + "epoch": 0.24346666666666666, + "grad_norm": 7.285531520843506, + "kl": 0.296875, + "learning_rate": 8.782666666666666e-07, + "loss": 0.0119, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5625, + "epoch": 0.2436, + "grad_norm": 10.943934440612793, + "kl": 0.3193359375, + "learning_rate": 8.781999999999999e-07, + "loss": 0.0128, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.24373333333333333, + "grad_norm": 9.262518882751465, + "kl": 0.25732421875, + "learning_rate": 8.781333333333333e-07, + "loss": 0.0103, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0, + "epoch": 0.24386666666666668, + "grad_norm": 9.96657657623291, + "kl": 0.30126953125, + "learning_rate": 8.780666666666666e-07, + "loss": 0.0121, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.125, + "epoch": 0.244, + "grad_norm": 9.37876033782959, + "kl": 0.296875, + "learning_rate": 8.78e-07, + "loss": 0.0119, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.1875, + "epoch": 0.24413333333333334, + "grad_norm": 8.658432960510254, + "kl": 0.29248046875, + "learning_rate": 8.779333333333333e-07, + "loss": 0.0117, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.3125, + "epoch": 0.24426666666666666, + "grad_norm": 5.847431659698486, + "kl": 0.279296875, + "learning_rate": 8.778666666666667e-07, + "loss": 0.0112, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.375, + "epoch": 0.2444, + "grad_norm": 27.661733627319336, + "kl": 0.4091796875, + "learning_rate": 8.778e-07, + "loss": 0.0164, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.75, + "epoch": 0.24453333333333332, + "grad_norm": 8.544075965881348, + "kl": 0.2822265625, + "learning_rate": 8.777333333333333e-07, + "loss": 0.0113, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.24466666666666667, + "grad_norm": 0.4169991612434387, + "kl": 0.28564453125, + "learning_rate": 8.776666666666667e-07, + "loss": 0.0114, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.875, + "epoch": 0.2448, + "grad_norm": 279.2618408203125, + "kl": 0.322265625, + "learning_rate": 8.776e-07, + "loss": 0.0129, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8125, + "epoch": 0.24493333333333334, + "grad_norm": 10.775151252746582, + "kl": 0.30859375, + "learning_rate": 8.775333333333334e-07, + "loss": 0.0123, + "reward": 1.3125, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.24506666666666665, + "grad_norm": 7.280965328216553, + "kl": 0.224609375, + "learning_rate": 8.774666666666666e-07, + "loss": 0.009, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.1875, + "epoch": 0.2452, + "grad_norm": 12.553099632263184, + "kl": 0.369140625, + "learning_rate": 8.774e-07, + "loss": 0.0148, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.8125, + "epoch": 0.24533333333333332, + "grad_norm": 2.9007081985473633, + "kl": 0.25390625, + "learning_rate": 8.773333333333332e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.9375, + "epoch": 0.24546666666666667, + "grad_norm": 13.69372272491455, + "kl": 0.4130859375, + "learning_rate": 8.772666666666666e-07, + "loss": 0.0165, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0, + "epoch": 0.2456, + "grad_norm": 11.729949951171875, + "kl": 0.2080078125, + "learning_rate": 8.771999999999999e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.24573333333333333, + "grad_norm": 6.820145130157471, + "kl": 0.21923828125, + "learning_rate": 8.771333333333332e-07, + "loss": 0.0088, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.24586666666666668, + "grad_norm": 5.812421798706055, + "kl": 0.24755859375, + "learning_rate": 8.770666666666666e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5, + "epoch": 0.246, + "grad_norm": 4.375680923461914, + "kl": 0.19580078125, + "learning_rate": 8.769999999999999e-07, + "loss": 0.0079, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5, + "epoch": 0.24613333333333334, + "grad_norm": 5.89382266998291, + "kl": 0.2001953125, + "learning_rate": 8.769333333333333e-07, + "loss": 0.008, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.1875, + "epoch": 0.24626666666666666, + "grad_norm": 7.525083541870117, + "kl": 0.2939453125, + "learning_rate": 8.768666666666666e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.6875, + "epoch": 0.2464, + "grad_norm": 12.096664428710938, + "kl": 0.23193359375, + "learning_rate": 8.768e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 0.24653333333333333, + "grad_norm": 6.7323689460754395, + "kl": 0.1875, + "learning_rate": 8.767333333333333e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.75, + "epoch": 0.24666666666666667, + "grad_norm": 10.135492324829102, + "kl": 0.21435546875, + "learning_rate": 8.766666666666667e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.875, + "epoch": 0.2468, + "grad_norm": 6.683185577392578, + "kl": 0.23388671875, + "learning_rate": 8.766e-07, + "loss": 0.0093, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.125, + "epoch": 0.24693333333333334, + "grad_norm": 6.929636001586914, + "kl": 0.486328125, + "learning_rate": 8.765333333333333e-07, + "loss": 0.0194, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.625, + "epoch": 0.24706666666666666, + "grad_norm": 5.23858642578125, + "kl": 0.283203125, + "learning_rate": 8.764666666666666e-07, + "loss": 0.0113, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.2472, + "grad_norm": 31.5235538482666, + "kl": 0.27685546875, + "learning_rate": 8.763999999999999e-07, + "loss": 0.0111, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 0.24733333333333332, + "grad_norm": 7.601171493530273, + "kl": 0.26416015625, + "learning_rate": 8.763333333333333e-07, + "loss": 0.0106, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.24746666666666667, + "grad_norm": 3.4883320331573486, + "kl": 0.271484375, + "learning_rate": 8.762666666666666e-07, + "loss": 0.0109, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.0625, + "epoch": 0.2476, + "grad_norm": 7.39101505279541, + "kl": 0.2294921875, + "learning_rate": 8.762e-07, + "loss": 0.0092, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.875, + "epoch": 0.24773333333333333, + "grad_norm": 5.813661098480225, + "kl": 0.1943359375, + "learning_rate": 8.761333333333333e-07, + "loss": 0.0078, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.24786666666666668, + "grad_norm": 6.7792863845825195, + "kl": 0.248046875, + "learning_rate": 8.760666666666667e-07, + "loss": 0.0099, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.248, + "grad_norm": 7.612885475158691, + "kl": 0.23046875, + "learning_rate": 8.76e-07, + "loss": 0.0092, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.3125, + "epoch": 0.24813333333333334, + "grad_norm": 10.490886688232422, + "kl": 0.5771484375, + "learning_rate": 8.759333333333332e-07, + "loss": 0.0231, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.24826666666666666, + "grad_norm": 0.5676988363265991, + "kl": 0.2626953125, + "learning_rate": 8.758666666666666e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0625, + "epoch": 0.2484, + "grad_norm": 4.743435382843018, + "kl": 0.20166015625, + "learning_rate": 8.757999999999999e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.24853333333333333, + "grad_norm": 13.219741821289062, + "kl": 0.263671875, + "learning_rate": 8.757333333333333e-07, + "loss": 0.0105, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.125, + "epoch": 0.24866666666666667, + "grad_norm": 8.825364112854004, + "kl": 0.265625, + "learning_rate": 8.756666666666666e-07, + "loss": 0.0106, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3125, + "epoch": 0.2488, + "grad_norm": 9.34105396270752, + "kl": 0.2275390625, + "learning_rate": 8.756e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.0, + "epoch": 0.24893333333333334, + "grad_norm": 11.797669410705566, + "kl": 0.25439453125, + "learning_rate": 8.755333333333333e-07, + "loss": 0.0102, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.24906666666666666, + "grad_norm": 7.065891265869141, + "kl": 0.2890625, + "learning_rate": 8.754666666666666e-07, + "loss": 0.0116, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5, + "epoch": 0.2492, + "grad_norm": 8.153080940246582, + "kl": 0.17724609375, + "learning_rate": 8.753999999999999e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.9375, + "epoch": 0.24933333333333332, + "grad_norm": 4.86957311630249, + "kl": 0.2666015625, + "learning_rate": 8.753333333333332e-07, + "loss": 0.0107, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.875, + "epoch": 0.24946666666666667, + "grad_norm": 5.944453716278076, + "kl": 0.27197265625, + "learning_rate": 8.752666666666666e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5, + "epoch": 0.2496, + "grad_norm": 95.79988861083984, + "kl": 0.22265625, + "learning_rate": 8.751999999999999e-07, + "loss": 0.0089, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0, + "epoch": 0.24973333333333333, + "grad_norm": 4.746254920959473, + "kl": 0.17041015625, + "learning_rate": 8.751333333333333e-07, + "loss": 0.0068, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6875, + "epoch": 0.24986666666666665, + "grad_norm": 11.774946212768555, + "kl": 0.23876953125, + "learning_rate": 8.750666666666666e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.25, + "grad_norm": 6.178918838500977, + "kl": 0.27099609375, + "learning_rate": 8.75e-07, + "loss": 0.0108, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.2501333333333333, + "grad_norm": 9.62911319732666, + "kl": 0.349609375, + "learning_rate": 8.749333333333333e-07, + "loss": 0.014, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.375, + "epoch": 0.2502666666666667, + "grad_norm": 7.626475811004639, + "kl": 0.2216796875, + "learning_rate": 8.748666666666667e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.2504, + "grad_norm": 6.049952983856201, + "kl": 0.201171875, + "learning_rate": 8.748e-07, + "loss": 0.008, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0625, + "epoch": 0.25053333333333333, + "grad_norm": 7.980217933654785, + "kl": 0.21728515625, + "learning_rate": 8.747333333333334e-07, + "loss": 0.0087, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.6875, + "epoch": 0.25066666666666665, + "grad_norm": 3.6953420639038086, + "kl": 0.166259765625, + "learning_rate": 8.746666666666667e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.25, + "epoch": 0.2508, + "grad_norm": 7.712893486022949, + "kl": 0.3896484375, + "learning_rate": 8.746e-07, + "loss": 0.0156, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.125, + "epoch": 0.25093333333333334, + "grad_norm": 6.770890235900879, + "kl": 0.21484375, + "learning_rate": 8.745333333333334e-07, + "loss": 0.0086, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.25106666666666666, + "grad_norm": 4.971492290496826, + "kl": 0.3369140625, + "learning_rate": 8.744666666666665e-07, + "loss": 0.0135, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.6875, + "epoch": 0.2512, + "grad_norm": 9.574129104614258, + "kl": 0.267578125, + "learning_rate": 8.743999999999999e-07, + "loss": 0.0107, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.875, + "epoch": 0.25133333333333335, + "grad_norm": 5.762954235076904, + "kl": 0.349609375, + "learning_rate": 8.743333333333332e-07, + "loss": 0.014, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.625, + "epoch": 0.25146666666666667, + "grad_norm": 13.718165397644043, + "kl": 0.234375, + "learning_rate": 8.742666666666666e-07, + "loss": 0.0094, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.125, + "epoch": 0.2516, + "grad_norm": 7.045107364654541, + "kl": 0.2763671875, + "learning_rate": 8.741999999999999e-07, + "loss": 0.011, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.625, + "epoch": 0.2517333333333333, + "grad_norm": 16.704748153686523, + "kl": 0.529296875, + "learning_rate": 8.741333333333333e-07, + "loss": 0.0212, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5, + "epoch": 0.2518666666666667, + "grad_norm": 7.901737213134766, + "kl": 0.2734375, + "learning_rate": 8.740666666666666e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.7708148658275604, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.252, + "grad_norm": 13.107185363769531, + "kl": 0.2529296875, + "learning_rate": 8.739999999999999e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.0625, + "epoch": 0.2521333333333333, + "grad_norm": 6.397396087646484, + "kl": 0.34521484375, + "learning_rate": 8.739333333333333e-07, + "loss": 0.0138, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.25226666666666664, + "grad_norm": 10.451923370361328, + "kl": 0.21923828125, + "learning_rate": 8.738666666666666e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.2524, + "grad_norm": 7.056065559387207, + "kl": 0.23291015625, + "learning_rate": 8.738e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.1875, + "epoch": 0.25253333333333333, + "grad_norm": 18.992799758911133, + "kl": 0.27490234375, + "learning_rate": 8.737333333333333e-07, + "loss": 0.011, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.25266666666666665, + "grad_norm": 7.8030571937561035, + "kl": 0.18212890625, + "learning_rate": 8.736666666666667e-07, + "loss": 0.0073, + "reward": 0.9375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.2528, + "grad_norm": 0.45215657353401184, + "kl": 0.25732421875, + "learning_rate": 8.736e-07, + "loss": 0.0103, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.25, + "epoch": 0.25293333333333334, + "grad_norm": 8.705466270446777, + "kl": 0.24755859375, + "learning_rate": 8.735333333333334e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.375, + "epoch": 0.25306666666666666, + "grad_norm": 6.435238361358643, + "kl": 0.20947265625, + "learning_rate": 8.734666666666666e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0625, + "epoch": 0.2532, + "grad_norm": 7.177262306213379, + "kl": 0.275390625, + "learning_rate": 8.733999999999999e-07, + "loss": 0.011, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.25333333333333335, + "grad_norm": 3.7754740715026855, + "kl": 0.28955078125, + "learning_rate": 8.733333333333333e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.2534666666666667, + "grad_norm": 0.3525218367576599, + "kl": 0.1787109375, + "learning_rate": 8.732666666666666e-07, + "loss": 0.0072, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.625, + "epoch": 0.2536, + "grad_norm": 21.716527938842773, + "kl": 0.3798828125, + "learning_rate": 8.732e-07, + "loss": 0.0152, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.75, + "epoch": 0.2537333333333333, + "grad_norm": 17.33063507080078, + "kl": 0.43212890625, + "learning_rate": 8.731333333333333e-07, + "loss": 0.0173, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.8125, + "epoch": 0.2538666666666667, + "grad_norm": 10.97245979309082, + "kl": 0.4072265625, + "learning_rate": 8.730666666666666e-07, + "loss": 0.0163, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.1875, + "epoch": 0.254, + "grad_norm": 9.174925804138184, + "kl": 0.28466796875, + "learning_rate": 8.729999999999999e-07, + "loss": 0.0114, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.6875, + "epoch": 0.2541333333333333, + "grad_norm": 7.257027626037598, + "kl": 0.1826171875, + "learning_rate": 8.729333333333333e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.375, + "epoch": 0.25426666666666664, + "grad_norm": 7.586062431335449, + "kl": 0.18310546875, + "learning_rate": 8.728666666666666e-07, + "loss": 0.0073, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.375, + "epoch": 0.2544, + "grad_norm": 8.337571144104004, + "kl": 0.46484375, + "learning_rate": 8.728e-07, + "loss": 0.0186, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.9375, + "epoch": 0.25453333333333333, + "grad_norm": 7.762795448303223, + "kl": 0.21142578125, + "learning_rate": 8.727333333333333e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1875, + "epoch": 0.25466666666666665, + "grad_norm": 8.323829650878906, + "kl": 0.26171875, + "learning_rate": 8.726666666666666e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0625, + "epoch": 0.2548, + "grad_norm": 5.807625770568848, + "kl": 0.185546875, + "learning_rate": 8.726e-07, + "loss": 0.0074, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5, + "epoch": 0.25493333333333335, + "grad_norm": 0.3303307592868805, + "kl": 0.171875, + "learning_rate": 8.725333333333333e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.4375, + "epoch": 0.25506666666666666, + "grad_norm": 8.146562576293945, + "kl": 0.26220703125, + "learning_rate": 8.724666666666666e-07, + "loss": 0.0105, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.75, + "epoch": 0.2552, + "grad_norm": 15.001016616821289, + "kl": 0.1611328125, + "learning_rate": 8.723999999999999e-07, + "loss": 0.0064, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.25533333333333336, + "grad_norm": 7.461472988128662, + "kl": 0.22119140625, + "learning_rate": 8.723333333333333e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.125, + "epoch": 0.2554666666666667, + "grad_norm": 4.2897844314575195, + "kl": 0.154296875, + "learning_rate": 8.722666666666666e-07, + "loss": 0.0062, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.2556, + "grad_norm": 9.321165084838867, + "kl": 0.2265625, + "learning_rate": 8.722e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0, + "epoch": 0.2557333333333333, + "grad_norm": 7.776479244232178, + "kl": 0.26123046875, + "learning_rate": 8.721333333333333e-07, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.2558666666666667, + "grad_norm": 8.566431045532227, + "kl": 0.25927734375, + "learning_rate": 8.720666666666666e-07, + "loss": 0.0104, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.6875, + "epoch": 0.256, + "grad_norm": 0.3469008803367615, + "kl": 0.14599609375, + "learning_rate": 8.72e-07, + "loss": 0.0058, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.1875, + "epoch": 0.2561333333333333, + "grad_norm": 12.105222702026367, + "kl": 0.5205078125, + "learning_rate": 8.719333333333333e-07, + "loss": 0.0208, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.25626666666666664, + "grad_norm": 5.276517391204834, + "kl": 0.23388671875, + "learning_rate": 8.718666666666667e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 0.2564, + "grad_norm": 6.489184379577637, + "kl": 0.234375, + "learning_rate": 8.718e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.25653333333333334, + "grad_norm": 8.473987579345703, + "kl": 0.20703125, + "learning_rate": 8.717333333333334e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9375, + "epoch": 0.25666666666666665, + "grad_norm": 6.508725166320801, + "kl": 0.23193359375, + "learning_rate": 8.716666666666667e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.75, + "epoch": 0.2568, + "grad_norm": 8.871006965637207, + "kl": 0.19140625, + "learning_rate": 8.716e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.375, + "epoch": 0.25693333333333335, + "grad_norm": 1.3434852361679077, + "kl": 0.3134765625, + "learning_rate": 8.715333333333333e-07, + "loss": 0.0125, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.125, + "epoch": 0.25706666666666667, + "grad_norm": 6.415406703948975, + "kl": 0.14404296875, + "learning_rate": 8.714666666666665e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 0.2572, + "grad_norm": 8.557143211364746, + "kl": 0.1396484375, + "learning_rate": 8.713999999999999e-07, + "loss": 0.0056, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.0625, + "epoch": 0.25733333333333336, + "grad_norm": 7.248201370239258, + "kl": 0.29296875, + "learning_rate": 8.713333333333332e-07, + "loss": 0.0117, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5625, + "epoch": 0.2574666666666667, + "grad_norm": 9.786611557006836, + "kl": 0.19482421875, + "learning_rate": 8.712666666666666e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.375, + "epoch": 0.2576, + "grad_norm": 3.940019130706787, + "kl": 0.15576171875, + "learning_rate": 8.711999999999999e-07, + "loss": 0.0062, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.2577333333333333, + "grad_norm": 9.539604187011719, + "kl": 0.2529296875, + "learning_rate": 8.711333333333333e-07, + "loss": 0.0101, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.625, + "epoch": 0.2578666666666667, + "grad_norm": 14.664939880371094, + "kl": 0.22509765625, + "learning_rate": 8.710666666666666e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.258, + "grad_norm": 10.040611267089844, + "kl": 0.24658203125, + "learning_rate": 8.71e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.2581333333333333, + "grad_norm": 7.808972358703613, + "kl": 0.1630859375, + "learning_rate": 8.709333333333333e-07, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.25826666666666664, + "grad_norm": 7.630433559417725, + "kl": 0.1845703125, + "learning_rate": 8.708666666666666e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.2584, + "grad_norm": 4.176311016082764, + "kl": 0.23095703125, + "learning_rate": 8.708e-07, + "loss": 0.0092, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3125, + "epoch": 0.25853333333333334, + "grad_norm": 5.911608695983887, + "kl": 0.314453125, + "learning_rate": 8.707333333333333e-07, + "loss": 0.0126, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.375, + "epoch": 0.25866666666666666, + "grad_norm": 5.0877909660339355, + "kl": 0.201171875, + "learning_rate": 8.706666666666667e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.375, + "epoch": 0.2588, + "grad_norm": 7.198719024658203, + "kl": 0.23876953125, + "learning_rate": 8.706e-07, + "loss": 0.0095, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5625, + "epoch": 0.25893333333333335, + "grad_norm": 5.906726360321045, + "kl": 0.18603515625, + "learning_rate": 8.705333333333334e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5, + "epoch": 0.25906666666666667, + "grad_norm": 10.467079162597656, + "kl": 0.4638671875, + "learning_rate": 8.704666666666667e-07, + "loss": 0.0185, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.75, + "epoch": 0.2592, + "grad_norm": 11.78520393371582, + "kl": 0.22998046875, + "learning_rate": 8.704e-07, + "loss": 0.0092, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8125, + "epoch": 0.25933333333333336, + "grad_norm": 30.489078521728516, + "kl": 1.2568359375, + "learning_rate": 8.703333333333333e-07, + "loss": 0.0505, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.5625, + "epoch": 0.2594666666666667, + "grad_norm": 0.4018608033657074, + "kl": 0.23828125, + "learning_rate": 8.702666666666665e-07, + "loss": 0.0095, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.6875, + "epoch": 0.2596, + "grad_norm": 0.5018933415412903, + "kl": 0.267578125, + "learning_rate": 8.701999999999999e-07, + "loss": 0.0107, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.125, + "epoch": 0.2597333333333333, + "grad_norm": 27.167707443237305, + "kl": 1.02978515625, + "learning_rate": 8.701333333333332e-07, + "loss": 0.0411, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0, + "epoch": 0.2598666666666667, + "grad_norm": 0.7724583745002747, + "kl": 0.306640625, + "learning_rate": 8.700666666666666e-07, + "loss": 0.0123, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.26, + "grad_norm": 5.906163692474365, + "kl": 0.14990234375, + "learning_rate": 8.699999999999999e-07, + "loss": 0.006, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.8125, + "epoch": 0.2601333333333333, + "grad_norm": 7.992588996887207, + "kl": 0.25146484375, + "learning_rate": 8.699333333333333e-07, + "loss": 0.01, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.26026666666666665, + "grad_norm": 14.287310600280762, + "kl": 0.18359375, + "learning_rate": 8.698666666666666e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 0.2604, + "grad_norm": 4.754171371459961, + "kl": 0.1728515625, + "learning_rate": 8.698e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.26053333333333334, + "grad_norm": 2.644566535949707, + "kl": 0.20068359375, + "learning_rate": 8.697333333333333e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.625, + "epoch": 0.26066666666666666, + "grad_norm": 10.313946723937988, + "kl": 0.234375, + "learning_rate": 8.696666666666667e-07, + "loss": 0.0094, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.25, + "epoch": 0.2608, + "grad_norm": 8.953773498535156, + "kl": 0.20751953125, + "learning_rate": 8.696e-07, + "loss": 0.0083, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4375, + "epoch": 0.26093333333333335, + "grad_norm": 7.360296726226807, + "kl": 0.2236328125, + "learning_rate": 8.695333333333333e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.25, + "epoch": 0.26106666666666667, + "grad_norm": 9.92516803741455, + "kl": 0.1630859375, + "learning_rate": 8.694666666666667e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9375, + "epoch": 0.2612, + "grad_norm": 5.142210483551025, + "kl": 0.1923828125, + "learning_rate": 8.693999999999999e-07, + "loss": 0.0077, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0625, + "epoch": 0.2613333333333333, + "grad_norm": 173.49578857421875, + "kl": 0.1923828125, + "learning_rate": 8.693333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.875, + "epoch": 0.2614666666666667, + "grad_norm": 14.601465225219727, + "kl": 0.380859375, + "learning_rate": 8.692666666666666e-07, + "loss": 0.0152, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5625, + "epoch": 0.2616, + "grad_norm": 0.7311822772026062, + "kl": 0.27392578125, + "learning_rate": 8.692e-07, + "loss": 0.011, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.2617333333333333, + "grad_norm": 0.5769819617271423, + "kl": 0.3447265625, + "learning_rate": 8.691333333333333e-07, + "loss": 0.0138, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.6875, + "epoch": 0.2618666666666667, + "grad_norm": 7.104939937591553, + "kl": 0.13427734375, + "learning_rate": 8.690666666666667e-07, + "loss": 0.0054, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.5, + "epoch": 0.262, + "grad_norm": 12.82795238494873, + "kl": 0.14794921875, + "learning_rate": 8.69e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.0, + "epoch": 0.26213333333333333, + "grad_norm": 460.6964111328125, + "kl": 0.38232421875, + "learning_rate": 8.689333333333333e-07, + "loss": 0.0153, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.875, + "epoch": 0.26226666666666665, + "grad_norm": 15.486150741577148, + "kl": 0.16552734375, + "learning_rate": 8.688666666666667e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.2624, + "grad_norm": 7.891584396362305, + "kl": 0.2041015625, + "learning_rate": 8.687999999999999e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0625, + "epoch": 0.26253333333333334, + "grad_norm": 7.161901473999023, + "kl": 0.1875, + "learning_rate": 8.687333333333333e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 0.26266666666666666, + "grad_norm": 48.19637680053711, + "kl": 0.1826171875, + "learning_rate": 8.686666666666666e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.2628, + "grad_norm": 8.240350723266602, + "kl": 0.22705078125, + "learning_rate": 8.686e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0625, + "epoch": 0.26293333333333335, + "grad_norm": 7.279357433319092, + "kl": 0.22900390625, + "learning_rate": 8.685333333333333e-07, + "loss": 0.0092, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5, + "epoch": 0.26306666666666667, + "grad_norm": 8.518050193786621, + "kl": 0.22119140625, + "learning_rate": 8.684666666666667e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0625, + "epoch": 0.2632, + "grad_norm": 5.052618503570557, + "kl": 0.18310546875, + "learning_rate": 8.683999999999999e-07, + "loss": 0.0073, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.4375, + "epoch": 0.2633333333333333, + "grad_norm": 5.572102069854736, + "kl": 0.19677734375, + "learning_rate": 8.683333333333332e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.6875, + "epoch": 0.2634666666666667, + "grad_norm": 8.57811164855957, + "kl": 0.28515625, + "learning_rate": 8.682666666666666e-07, + "loss": 0.0114, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.2636, + "grad_norm": 7.9260125160217285, + "kl": 0.30908203125, + "learning_rate": 8.681999999999999e-07, + "loss": 0.0123, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.125, + "epoch": 0.2637333333333333, + "grad_norm": 6.708944797515869, + "kl": 0.2001953125, + "learning_rate": 8.681333333333333e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.75, + "epoch": 0.2638666666666667, + "grad_norm": 10.357047080993652, + "kl": 0.234375, + "learning_rate": 8.680666666666666e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.875, + "epoch": 0.264, + "grad_norm": 23.414583206176758, + "kl": 0.2646484375, + "learning_rate": 8.68e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.3125, + "epoch": 0.26413333333333333, + "grad_norm": 6.562000274658203, + "kl": 0.333984375, + "learning_rate": 8.679333333333333e-07, + "loss": 0.0134, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.26426666666666665, + "grad_norm": 9.268474578857422, + "kl": 0.24609375, + "learning_rate": 8.678666666666667e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 0.2644, + "grad_norm": 5.350624084472656, + "kl": 0.16650390625, + "learning_rate": 8.678e-07, + "loss": 0.0067, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0625, + "epoch": 0.26453333333333334, + "grad_norm": 5.17426872253418, + "kl": 0.15185546875, + "learning_rate": 8.677333333333333e-07, + "loss": 0.0061, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.25, + "epoch": 0.26466666666666666, + "grad_norm": 9.512858390808105, + "kl": 0.2783203125, + "learning_rate": 8.676666666666667e-07, + "loss": 0.0111, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.2648, + "grad_norm": 0.5493490099906921, + "kl": 0.251953125, + "learning_rate": 8.676e-07, + "loss": 0.0101, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.4375, + "epoch": 0.26493333333333335, + "grad_norm": 9.250264167785645, + "kl": 0.296875, + "learning_rate": 8.675333333333334e-07, + "loss": 0.0119, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.8125, + "epoch": 0.2650666666666667, + "grad_norm": 6.196630477905273, + "kl": 0.20068359375, + "learning_rate": 8.674666666666667e-07, + "loss": 0.008, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.125, + "epoch": 0.2652, + "grad_norm": 7.848118782043457, + "kl": 0.220703125, + "learning_rate": 8.673999999999999e-07, + "loss": 0.0088, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.4375, + "epoch": 0.2653333333333333, + "grad_norm": 0.5232159495353699, + "kl": 0.35546875, + "learning_rate": 8.673333333333332e-07, + "loss": 0.0142, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5625, + "epoch": 0.2654666666666667, + "grad_norm": 9.473394393920898, + "kl": 0.22998046875, + "learning_rate": 8.672666666666666e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.3125, + "epoch": 0.2656, + "grad_norm": 6.418197154998779, + "kl": 0.3994140625, + "learning_rate": 8.671999999999999e-07, + "loss": 0.016, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.9375, + "epoch": 0.2657333333333333, + "grad_norm": 5.180598258972168, + "kl": 0.19775390625, + "learning_rate": 8.671333333333332e-07, + "loss": 0.0079, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.875, + "epoch": 0.26586666666666664, + "grad_norm": 4.642916202545166, + "kl": 0.2314453125, + "learning_rate": 8.670666666666666e-07, + "loss": 0.0092, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5625, + "epoch": 0.266, + "grad_norm": 6.003794193267822, + "kl": 0.2353515625, + "learning_rate": 8.669999999999999e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.9375, + "epoch": 0.26613333333333333, + "grad_norm": 4.156412601470947, + "kl": 0.3203125, + "learning_rate": 8.669333333333333e-07, + "loss": 0.0128, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.26626666666666665, + "grad_norm": 3.9728124141693115, + "kl": 0.1650390625, + "learning_rate": 8.668666666666666e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.2664, + "grad_norm": 6.36150598526001, + "kl": 0.197265625, + "learning_rate": 8.668e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.8125, + "epoch": 0.26653333333333334, + "grad_norm": 7.929385662078857, + "kl": 0.20703125, + "learning_rate": 8.667333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 0.26666666666666666, + "grad_norm": 16.613740921020508, + "kl": 0.17724609375, + "learning_rate": 8.666666666666667e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 0.2668, + "grad_norm": 6.721795558929443, + "kl": 0.2529296875, + "learning_rate": 8.666e-07, + "loss": 0.0101, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.875, + "epoch": 0.26693333333333336, + "grad_norm": 6.458136081695557, + "kl": 0.150390625, + "learning_rate": 8.665333333333334e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.75, + "epoch": 0.2670666666666667, + "grad_norm": 13.635089874267578, + "kl": 0.27392578125, + "learning_rate": 8.664666666666667e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.6875, + "epoch": 0.2672, + "grad_norm": 0.42088577151298523, + "kl": 0.22412109375, + "learning_rate": 8.663999999999999e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.2673333333333333, + "grad_norm": 8.436193466186523, + "kl": 0.23193359375, + "learning_rate": 8.663333333333333e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.2674666666666667, + "grad_norm": 4.29108190536499, + "kl": 0.14990234375, + "learning_rate": 8.662666666666666e-07, + "loss": 0.006, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.875, + "epoch": 0.2676, + "grad_norm": 4.8528594970703125, + "kl": 0.2119140625, + "learning_rate": 8.662e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6875, + "epoch": 0.2677333333333333, + "grad_norm": 3.972073554992676, + "kl": 0.16162109375, + "learning_rate": 8.661333333333333e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.8125, + "epoch": 0.26786666666666664, + "grad_norm": 16.25084114074707, + "kl": 0.1669921875, + "learning_rate": 8.660666666666667e-07, + "loss": 0.0067, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.268, + "grad_norm": 11.63433837890625, + "kl": 0.27978515625, + "learning_rate": 8.659999999999999e-07, + "loss": 0.0112, + "reward": 1.25, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.26813333333333333, + "grad_norm": 7.487127304077148, + "kl": 0.2412109375, + "learning_rate": 8.659333333333333e-07, + "loss": 0.0097, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.26826666666666665, + "grad_norm": 5.855743885040283, + "kl": 0.28955078125, + "learning_rate": 8.658666666666666e-07, + "loss": 0.0116, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.25, + "epoch": 0.2684, + "grad_norm": 8.574646949768066, + "kl": 0.1943359375, + "learning_rate": 8.657999999999999e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0, + "epoch": 0.26853333333333335, + "grad_norm": 14.858548164367676, + "kl": 0.34375, + "learning_rate": 8.657333333333333e-07, + "loss": 0.0137, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.9375, + "epoch": 0.26866666666666666, + "grad_norm": 6.129873275756836, + "kl": 0.28955078125, + "learning_rate": 8.656666666666666e-07, + "loss": 0.0116, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.2688, + "grad_norm": 7.884200572967529, + "kl": 0.25244140625, + "learning_rate": 8.656e-07, + "loss": 0.0101, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.26893333333333336, + "grad_norm": 4.736770153045654, + "kl": 0.29443359375, + "learning_rate": 8.655333333333333e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.5625, + "epoch": 0.2690666666666667, + "grad_norm": 9.171907424926758, + "kl": 0.25390625, + "learning_rate": 8.654666666666667e-07, + "loss": 0.0102, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.625, + "epoch": 0.2692, + "grad_norm": 334.3135681152344, + "kl": 0.29541015625, + "learning_rate": 8.653999999999999e-07, + "loss": 0.0118, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.6875, + "epoch": 0.2693333333333333, + "grad_norm": 4.940028190612793, + "kl": 0.3671875, + "learning_rate": 8.653333333333333e-07, + "loss": 0.0147, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.6875, + "epoch": 0.2694666666666667, + "grad_norm": 5.5719757080078125, + "kl": 0.255859375, + "learning_rate": 8.652666666666666e-07, + "loss": 0.0102, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.8125, + "epoch": 0.2696, + "grad_norm": 5.7123026847839355, + "kl": 0.318359375, + "learning_rate": 8.651999999999999e-07, + "loss": 0.0127, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.2697333333333333, + "grad_norm": 7.198159217834473, + "kl": 0.19189453125, + "learning_rate": 8.651333333333333e-07, + "loss": 0.0077, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5, + "epoch": 0.26986666666666664, + "grad_norm": 5.861291408538818, + "kl": 0.33935546875, + "learning_rate": 8.650666666666666e-07, + "loss": 0.0136, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.625, + "epoch": 0.27, + "grad_norm": 7.319359302520752, + "kl": 0.5302734375, + "learning_rate": 8.65e-07, + "loss": 0.0212, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.75, + "epoch": 0.27013333333333334, + "grad_norm": 0.8409649729728699, + "kl": 0.400390625, + "learning_rate": 8.649333333333333e-07, + "loss": 0.016, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.375, + "epoch": 0.27026666666666666, + "grad_norm": 5.7812700271606445, + "kl": 0.21728515625, + "learning_rate": 8.648666666666667e-07, + "loss": 0.0087, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.0, + "epoch": 0.2704, + "grad_norm": 18.105709075927734, + "kl": 0.2265625, + "learning_rate": 8.648e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.0625, + "epoch": 0.27053333333333335, + "grad_norm": 10.770057678222656, + "kl": 0.54150390625, + "learning_rate": 8.647333333333334e-07, + "loss": 0.0216, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.1875, + "epoch": 0.27066666666666667, + "grad_norm": 6.141766548156738, + "kl": 0.2490234375, + "learning_rate": 8.646666666666667e-07, + "loss": 0.01, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.9375, + "epoch": 0.2708, + "grad_norm": 7.347733974456787, + "kl": 0.18017578125, + "learning_rate": 8.645999999999999e-07, + "loss": 0.0072, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.27093333333333336, + "grad_norm": 14.669296264648438, + "kl": 0.1904296875, + "learning_rate": 8.645333333333333e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.25, + "epoch": 0.2710666666666667, + "grad_norm": 67.95718383789062, + "kl": 0.2890625, + "learning_rate": 8.644666666666666e-07, + "loss": 0.0116, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.6875, + "epoch": 0.2712, + "grad_norm": 11.164050102233887, + "kl": 0.212890625, + "learning_rate": 8.643999999999999e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.3125, + "epoch": 0.2713333333333333, + "grad_norm": 6.809377193450928, + "kl": 0.19873046875, + "learning_rate": 8.643333333333332e-07, + "loss": 0.008, + "reward": 1.25, + "reward_std": 0.6760360598564148, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5625, + "epoch": 0.2714666666666667, + "grad_norm": 9.248518943786621, + "kl": 0.28125, + "learning_rate": 8.642666666666666e-07, + "loss": 0.0113, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.2716, + "grad_norm": 32.278743743896484, + "kl": 1.66357421875, + "learning_rate": 8.641999999999999e-07, + "loss": 0.0664, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0625, + "epoch": 0.2717333333333333, + "grad_norm": 11.661160469055176, + "kl": 0.31640625, + "learning_rate": 8.641333333333333e-07, + "loss": 0.0126, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8125, + "epoch": 0.27186666666666665, + "grad_norm": 7.33292818069458, + "kl": 0.15625, + "learning_rate": 8.640666666666666e-07, + "loss": 0.0062, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.75, + "epoch": 0.272, + "grad_norm": 9.041893005371094, + "kl": 0.2578125, + "learning_rate": 8.639999999999999e-07, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 0.27213333333333334, + "grad_norm": 5.753262996673584, + "kl": 0.23876953125, + "learning_rate": 8.639333333333333e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5625, + "epoch": 0.27226666666666666, + "grad_norm": 10.983271598815918, + "kl": 0.2568359375, + "learning_rate": 8.638666666666666e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.125, + "epoch": 0.2724, + "grad_norm": 13.175422668457031, + "kl": 0.3125, + "learning_rate": 8.638e-07, + "loss": 0.0125, + "reward": 1.5, + "reward_std": 0.5940381735563278, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.125, + "epoch": 0.27253333333333335, + "grad_norm": 12.269314765930176, + "kl": 0.4248046875, + "learning_rate": 8.637333333333333e-07, + "loss": 0.017, + "reward": 1.3125, + "reward_std": 0.7952259480953217, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.9375, + "epoch": 0.27266666666666667, + "grad_norm": 7.193334102630615, + "kl": 0.220703125, + "learning_rate": 8.636666666666667e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.25, + "epoch": 0.2728, + "grad_norm": 7.207674980163574, + "kl": 0.47265625, + "learning_rate": 8.636e-07, + "loss": 0.0189, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 0.2729333333333333, + "grad_norm": 8.261568069458008, + "kl": 0.27490234375, + "learning_rate": 8.635333333333334e-07, + "loss": 0.011, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.625, + "epoch": 0.2730666666666667, + "grad_norm": 7.75327205657959, + "kl": 0.24609375, + "learning_rate": 8.634666666666667e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.4375, + "epoch": 0.2732, + "grad_norm": 5.592901706695557, + "kl": 0.21923828125, + "learning_rate": 8.633999999999999e-07, + "loss": 0.0088, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.2733333333333333, + "grad_norm": 19.466283798217773, + "kl": 0.2255859375, + "learning_rate": 8.633333333333333e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5625, + "epoch": 0.2734666666666667, + "grad_norm": 12.07862377166748, + "kl": 0.3193359375, + "learning_rate": 8.632666666666665e-07, + "loss": 0.0128, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1875, + "epoch": 0.2736, + "grad_norm": 8.332717895507812, + "kl": 0.1845703125, + "learning_rate": 8.632e-07, + "loss": 0.0074, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.27373333333333333, + "grad_norm": 9.004755973815918, + "kl": 0.24951171875, + "learning_rate": 8.631333333333332e-07, + "loss": 0.01, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.9375, + "epoch": 0.27386666666666665, + "grad_norm": 4.390677452087402, + "kl": 0.22900390625, + "learning_rate": 8.630666666666666e-07, + "loss": 0.0091, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.875, + "epoch": 0.274, + "grad_norm": 6.389697074890137, + "kl": 0.287109375, + "learning_rate": 8.629999999999999e-07, + "loss": 0.0115, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.375, + "epoch": 0.27413333333333334, + "grad_norm": 62.19599914550781, + "kl": 0.18359375, + "learning_rate": 8.629333333333333e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.1875, + "epoch": 0.27426666666666666, + "grad_norm": 10.358439445495605, + "kl": 0.3505859375, + "learning_rate": 8.628666666666666e-07, + "loss": 0.014, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.875, + "epoch": 0.2744, + "grad_norm": 10.03878402709961, + "kl": 0.26123046875, + "learning_rate": 8.628e-07, + "loss": 0.0104, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.1875, + "epoch": 0.27453333333333335, + "grad_norm": 14.644375801086426, + "kl": 0.203125, + "learning_rate": 8.627333333333333e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.8125, + "epoch": 0.27466666666666667, + "grad_norm": 10.22789478302002, + "kl": 0.2529296875, + "learning_rate": 8.626666666666666e-07, + "loss": 0.0101, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.25, + "epoch": 0.2748, + "grad_norm": 10.2930908203125, + "kl": 0.255859375, + "learning_rate": 8.626e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 0.2749333333333333, + "grad_norm": 7.3130083084106445, + "kl": 0.28271484375, + "learning_rate": 8.625333333333333e-07, + "loss": 0.0113, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.6875, + "epoch": 0.2750666666666667, + "grad_norm": 105.99554443359375, + "kl": 0.220703125, + "learning_rate": 8.624666666666667e-07, + "loss": 0.0088, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.3125, + "epoch": 0.2752, + "grad_norm": 9.192728042602539, + "kl": 0.3173828125, + "learning_rate": 8.624e-07, + "loss": 0.0127, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.2753333333333333, + "grad_norm": 7.4202880859375, + "kl": 0.1767578125, + "learning_rate": 8.623333333333333e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.2754666666666667, + "grad_norm": 7.774544715881348, + "kl": 0.22998046875, + "learning_rate": 8.622666666666666e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0625, + "epoch": 0.2756, + "grad_norm": 9.321584701538086, + "kl": 0.234375, + "learning_rate": 8.622e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.875, + "epoch": 0.27573333333333333, + "grad_norm": 11.320101737976074, + "kl": 0.4208984375, + "learning_rate": 8.621333333333333e-07, + "loss": 0.0168, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 0.27586666666666665, + "grad_norm": 6.756550312042236, + "kl": 0.178955078125, + "learning_rate": 8.620666666666666e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.276, + "grad_norm": 18.792545318603516, + "kl": 0.15625, + "learning_rate": 8.62e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.6875, + "epoch": 0.27613333333333334, + "grad_norm": 7.668103218078613, + "kl": 0.20166015625, + "learning_rate": 8.619333333333333e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.0625, + "epoch": 0.27626666666666666, + "grad_norm": 13.47214412689209, + "kl": 0.44384765625, + "learning_rate": 8.618666666666667e-07, + "loss": 0.0177, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 0.2764, + "grad_norm": 0.4252094030380249, + "kl": 0.240234375, + "learning_rate": 8.618e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.27653333333333335, + "grad_norm": 5.96547794342041, + "kl": 0.20703125, + "learning_rate": 8.617333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.625, + "epoch": 0.27666666666666667, + "grad_norm": 8.337780952453613, + "kl": 0.326171875, + "learning_rate": 8.616666666666666e-07, + "loss": 0.0131, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.375, + "epoch": 0.2768, + "grad_norm": 5.138183116912842, + "kl": 0.22021484375, + "learning_rate": 8.616e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5, + "epoch": 0.2769333333333333, + "grad_norm": 6.204951286315918, + "kl": 0.2978515625, + "learning_rate": 8.615333333333333e-07, + "loss": 0.0119, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.25, + "epoch": 0.2770666666666667, + "grad_norm": 0.9224139451980591, + "kl": 0.2822265625, + "learning_rate": 8.614666666666666e-07, + "loss": 0.0113, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.0625, + "epoch": 0.2772, + "grad_norm": 9.973669052124023, + "kl": 0.251953125, + "learning_rate": 8.614e-07, + "loss": 0.0101, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.2773333333333333, + "grad_norm": 5.58294677734375, + "kl": 0.12109375, + "learning_rate": 8.613333333333332e-07, + "loss": 0.0048, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.1875, + "epoch": 0.27746666666666664, + "grad_norm": 7.363045692443848, + "kl": 0.18701171875, + "learning_rate": 8.612666666666666e-07, + "loss": 0.0075, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.2776, + "grad_norm": 6.778614521026611, + "kl": 0.20751953125, + "learning_rate": 8.611999999999999e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5, + "epoch": 0.27773333333333333, + "grad_norm": 9.018392562866211, + "kl": 0.16162109375, + "learning_rate": 8.611333333333333e-07, + "loss": 0.0064, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.3125, + "epoch": 0.27786666666666665, + "grad_norm": 7.137028217315674, + "kl": 0.259765625, + "learning_rate": 8.610666666666666e-07, + "loss": 0.0104, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.8125, + "epoch": 0.278, + "grad_norm": 10.636902809143066, + "kl": 0.6806640625, + "learning_rate": 8.61e-07, + "loss": 0.0272, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.9375, + "epoch": 0.27813333333333334, + "grad_norm": 7.863849639892578, + "kl": 0.40234375, + "learning_rate": 8.609333333333333e-07, + "loss": 0.0161, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.27826666666666666, + "grad_norm": 12.02219295501709, + "kl": 0.453125, + "learning_rate": 8.608666666666666e-07, + "loss": 0.0181, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.75, + "epoch": 0.2784, + "grad_norm": 0.5118366479873657, + "kl": 0.2529296875, + "learning_rate": 8.608e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.3125, + "epoch": 0.27853333333333335, + "grad_norm": 13.042466163635254, + "kl": 0.360107421875, + "learning_rate": 8.607333333333333e-07, + "loss": 0.0144, + "reward": 1.5, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.0625, + "epoch": 0.2786666666666667, + "grad_norm": 7.598360538482666, + "kl": 0.486328125, + "learning_rate": 8.606666666666667e-07, + "loss": 0.0194, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.75, + "epoch": 0.2788, + "grad_norm": 9.037683486938477, + "kl": 0.4560546875, + "learning_rate": 8.606e-07, + "loss": 0.0183, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.2789333333333333, + "grad_norm": 17.73073387145996, + "kl": 0.25048828125, + "learning_rate": 8.605333333333334e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.6875, + "epoch": 0.2790666666666667, + "grad_norm": 5.927797317504883, + "kl": 0.22900390625, + "learning_rate": 8.604666666666667e-07, + "loss": 0.0091, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.4375, + "epoch": 0.2792, + "grad_norm": 6.048162937164307, + "kl": 0.29931640625, + "learning_rate": 8.604000000000001e-07, + "loss": 0.012, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.375, + "epoch": 0.2793333333333333, + "grad_norm": 12.000776290893555, + "kl": 0.3798828125, + "learning_rate": 8.603333333333332e-07, + "loss": 0.0152, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.0, + "epoch": 0.27946666666666664, + "grad_norm": 20.347026824951172, + "kl": 0.6103515625, + "learning_rate": 8.602666666666665e-07, + "loss": 0.0245, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5625, + "epoch": 0.2796, + "grad_norm": 4.679325103759766, + "kl": 0.16162109375, + "learning_rate": 8.601999999999999e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.27973333333333333, + "grad_norm": 11.012667655944824, + "kl": 0.23681640625, + "learning_rate": 8.601333333333332e-07, + "loss": 0.0095, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.3125, + "epoch": 0.27986666666666665, + "grad_norm": 10.164857864379883, + "kl": 0.37890625, + "learning_rate": 8.600666666666666e-07, + "loss": 0.0152, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.1875, + "epoch": 0.28, + "grad_norm": 2.8829407691955566, + "kl": 0.314453125, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0126, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.625, + "epoch": 0.28013333333333335, + "grad_norm": 105.28815460205078, + "kl": 0.212890625, + "learning_rate": 8.599333333333333e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.9375, + "epoch": 0.28026666666666666, + "grad_norm": 10.113919258117676, + "kl": 0.37841796875, + "learning_rate": 8.598666666666666e-07, + "loss": 0.0151, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.2804, + "grad_norm": 7.692734241485596, + "kl": 0.2890625, + "learning_rate": 8.598e-07, + "loss": 0.0115, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.3125, + "epoch": 0.28053333333333336, + "grad_norm": 50.675445556640625, + "kl": 0.28125, + "learning_rate": 8.597333333333333e-07, + "loss": 0.0112, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5625, + "epoch": 0.2806666666666667, + "grad_norm": 9.399991989135742, + "kl": 0.26953125, + "learning_rate": 8.596666666666667e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.875, + "epoch": 0.2808, + "grad_norm": 10.436001777648926, + "kl": 0.2841796875, + "learning_rate": 8.596e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.8408745229244232, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8125, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.4375, + "epoch": 0.2809333333333333, + "grad_norm": 7.782219886779785, + "kl": 0.26220703125, + "learning_rate": 8.595333333333333e-07, + "loss": 0.0105, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.0625, + "epoch": 0.2810666666666667, + "grad_norm": 15.292816162109375, + "kl": 0.5283203125, + "learning_rate": 8.594666666666667e-07, + "loss": 0.0211, + "reward": 1.0625, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.8125, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.9375, + "epoch": 0.2812, + "grad_norm": 5.21185827255249, + "kl": 0.18505859375, + "learning_rate": 8.594e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.375, + "epoch": 0.2813333333333333, + "grad_norm": 6.957972049713135, + "kl": 0.18115234375, + "learning_rate": 8.593333333333333e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.3125, + "epoch": 0.28146666666666664, + "grad_norm": 9.31915283203125, + "kl": 0.2490234375, + "learning_rate": 8.592666666666666e-07, + "loss": 0.0099, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.125, + "epoch": 0.2816, + "grad_norm": 8.241409301757812, + "kl": 0.35595703125, + "learning_rate": 8.592e-07, + "loss": 0.0142, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5625, + "epoch": 0.28173333333333334, + "grad_norm": 20.967918395996094, + "kl": 0.42822265625, + "learning_rate": 8.591333333333333e-07, + "loss": 0.0171, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.4375, + "epoch": 0.28186666666666665, + "grad_norm": 16.224546432495117, + "kl": 0.5205078125, + "learning_rate": 8.590666666666667e-07, + "loss": 0.0208, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.0625, + "epoch": 0.282, + "grad_norm": 10.432475090026855, + "kl": 0.21630859375, + "learning_rate": 8.59e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.8125, + "epoch": 0.28213333333333335, + "grad_norm": 45.77690505981445, + "kl": 0.3779296875, + "learning_rate": 8.589333333333332e-07, + "loss": 0.0151, + "reward": 1.0625, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.25, + "epoch": 0.28226666666666667, + "grad_norm": 9.506906509399414, + "kl": 0.3515625, + "learning_rate": 8.588666666666666e-07, + "loss": 0.0141, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.6875, + "epoch": 0.2824, + "grad_norm": 11.036328315734863, + "kl": 0.36962890625, + "learning_rate": 8.587999999999999e-07, + "loss": 0.0148, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.625, + "epoch": 0.28253333333333336, + "grad_norm": 12.185890197753906, + "kl": 0.27587890625, + "learning_rate": 8.587333333333333e-07, + "loss": 0.011, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.9375, + "epoch": 0.2826666666666667, + "grad_norm": 13.541666984558105, + "kl": 0.33056640625, + "learning_rate": 8.586666666666666e-07, + "loss": 0.0132, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.0625, + "epoch": 0.2828, + "grad_norm": 58.9971809387207, + "kl": 0.2490234375, + "learning_rate": 8.586e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.0625, + "epoch": 0.2829333333333333, + "grad_norm": 8.123245239257812, + "kl": 0.1806640625, + "learning_rate": 8.585333333333333e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.5, + "epoch": 0.2830666666666667, + "grad_norm": 11.586851119995117, + "kl": 0.4150390625, + "learning_rate": 8.584666666666667e-07, + "loss": 0.0166, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.375, + "epoch": 0.2832, + "grad_norm": 7.511420249938965, + "kl": 0.23583984375, + "learning_rate": 8.584e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.6875, + "epoch": 0.2833333333333333, + "grad_norm": 7.5502028465271, + "kl": 0.17578125, + "learning_rate": 8.583333333333332e-07, + "loss": 0.007, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.8125, + "epoch": 0.28346666666666664, + "grad_norm": 7.907275676727295, + "kl": 0.2451171875, + "learning_rate": 8.582666666666666e-07, + "loss": 0.0098, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.9375, + "epoch": 0.2836, + "grad_norm": 8.199926376342773, + "kl": 0.294921875, + "learning_rate": 8.581999999999999e-07, + "loss": 0.0118, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.125, + "epoch": 0.28373333333333334, + "grad_norm": 11.41871452331543, + "kl": 0.6337890625, + "learning_rate": 8.581333333333333e-07, + "loss": 0.0254, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.625, + "epoch": 0.28386666666666666, + "grad_norm": 7.784367561340332, + "kl": 0.3251953125, + "learning_rate": 8.580666666666666e-07, + "loss": 0.013, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.8125, + "epoch": 0.284, + "grad_norm": 5.635977745056152, + "kl": 0.4072265625, + "learning_rate": 8.58e-07, + "loss": 0.0163, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.9375, + "epoch": 0.28413333333333335, + "grad_norm": 7.286763668060303, + "kl": 0.2734375, + "learning_rate": 8.579333333333333e-07, + "loss": 0.0109, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.0625, + "epoch": 0.28426666666666667, + "grad_norm": 12.623493194580078, + "kl": 0.3935546875, + "learning_rate": 8.578666666666667e-07, + "loss": 0.0158, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.3125, + "epoch": 0.2844, + "grad_norm": 12.199237823486328, + "kl": 0.513671875, + "learning_rate": 8.578e-07, + "loss": 0.0206, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.5625, + "epoch": 0.28453333333333336, + "grad_norm": 11.801044464111328, + "kl": 0.498046875, + "learning_rate": 8.577333333333333e-07, + "loss": 0.02, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.5625, + "epoch": 0.2846666666666667, + "grad_norm": 19.781169891357422, + "kl": 0.423828125, + "learning_rate": 8.576666666666667e-07, + "loss": 0.0169, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.8125, + "epoch": 0.2848, + "grad_norm": 6.577229976654053, + "kl": 0.52734375, + "learning_rate": 8.576e-07, + "loss": 0.021, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.2849333333333333, + "grad_norm": 18.387250900268555, + "kl": 0.490234375, + "learning_rate": 8.575333333333333e-07, + "loss": 0.0196, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.6875, + "epoch": 0.2850666666666667, + "grad_norm": 10.467880249023438, + "kl": 0.611328125, + "learning_rate": 8.574666666666666e-07, + "loss": 0.0243, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.1875, + "epoch": 0.2852, + "grad_norm": 10.193755149841309, + "kl": 0.27392578125, + "learning_rate": 8.574e-07, + "loss": 0.011, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1875, + "epoch": 0.2853333333333333, + "grad_norm": 12.94786262512207, + "kl": 0.76171875, + "learning_rate": 8.573333333333332e-07, + "loss": 0.0304, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.8125, + "epoch": 0.28546666666666665, + "grad_norm": 9.917055130004883, + "kl": 0.17919921875, + "learning_rate": 8.572666666666666e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1875, + "epoch": 0.2856, + "grad_norm": 13.062222480773926, + "kl": 0.4208984375, + "learning_rate": 8.571999999999999e-07, + "loss": 0.0168, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.1875, + "epoch": 0.28573333333333334, + "grad_norm": 9.943913459777832, + "kl": 0.4990234375, + "learning_rate": 8.571333333333332e-07, + "loss": 0.0199, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.625, + "epoch": 0.28586666666666666, + "grad_norm": 36.34223175048828, + "kl": 0.36376953125, + "learning_rate": 8.570666666666666e-07, + "loss": 0.0145, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0, + "epoch": 0.286, + "grad_norm": 49.6933479309082, + "kl": 0.35888671875, + "learning_rate": 8.569999999999999e-07, + "loss": 0.0144, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.875, + "epoch": 0.28613333333333335, + "grad_norm": 9.711081504821777, + "kl": 0.3515625, + "learning_rate": 8.569333333333333e-07, + "loss": 0.0141, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.8125, + "epoch": 0.28626666666666667, + "grad_norm": 11.581778526306152, + "kl": 0.27734375, + "learning_rate": 8.568666666666666e-07, + "loss": 0.0111, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.8125, + "epoch": 0.2864, + "grad_norm": 8.85354995727539, + "kl": 0.2998046875, + "learning_rate": 8.568e-07, + "loss": 0.012, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.0, + "epoch": 0.2865333333333333, + "grad_norm": 9.070054054260254, + "kl": 0.384765625, + "learning_rate": 8.567333333333333e-07, + "loss": 0.0154, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.375, + "epoch": 0.2866666666666667, + "grad_norm": 9.230780601501465, + "kl": 0.2890625, + "learning_rate": 8.566666666666667e-07, + "loss": 0.0116, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.8125, + "epoch": 0.2868, + "grad_norm": 9.756985664367676, + "kl": 0.666015625, + "learning_rate": 8.566e-07, + "loss": 0.0266, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.3125, + "epoch": 0.2869333333333333, + "grad_norm": 9.752785682678223, + "kl": 0.5830078125, + "learning_rate": 8.565333333333334e-07, + "loss": 0.0234, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.5, + "epoch": 0.2870666666666667, + "grad_norm": 8.799137115478516, + "kl": 0.423828125, + "learning_rate": 8.564666666666667e-07, + "loss": 0.017, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.0625, + "epoch": 0.2872, + "grad_norm": 11.42215347290039, + "kl": 0.259765625, + "learning_rate": 8.564e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.0625, + "epoch": 0.28733333333333333, + "grad_norm": 12.152588844299316, + "kl": 0.5078125, + "learning_rate": 8.563333333333333e-07, + "loss": 0.0203, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.0625, + "epoch": 0.28746666666666665, + "grad_norm": 125.24358367919922, + "kl": 0.5810546875, + "learning_rate": 8.562666666666666e-07, + "loss": 0.0232, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.3125, + "epoch": 0.2876, + "grad_norm": 13.259791374206543, + "kl": 0.771484375, + "learning_rate": 8.562e-07, + "loss": 0.0308, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.1875, + "epoch": 0.28773333333333334, + "grad_norm": 12.234010696411133, + "kl": 0.51953125, + "learning_rate": 8.561333333333332e-07, + "loss": 0.0208, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.375, + "epoch": 0.28786666666666666, + "grad_norm": 9.33641529083252, + "kl": 0.388671875, + "learning_rate": 8.560666666666666e-07, + "loss": 0.0155, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.875, + "epoch": 0.288, + "grad_norm": 8.676507949829102, + "kl": 0.59375, + "learning_rate": 8.559999999999999e-07, + "loss": 0.0237, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.25, + "epoch": 0.28813333333333335, + "grad_norm": 12.718012809753418, + "kl": 0.67578125, + "learning_rate": 8.559333333333333e-07, + "loss": 0.027, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.3125, + "epoch": 0.28826666666666667, + "grad_norm": 10.595438003540039, + "kl": 0.53515625, + "learning_rate": 8.558666666666666e-07, + "loss": 0.0214, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.6875, + "epoch": 0.2884, + "grad_norm": 0.7166391015052795, + "kl": 0.5986328125, + "learning_rate": 8.557999999999999e-07, + "loss": 0.0239, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.5, + "epoch": 0.2885333333333333, + "grad_norm": 17.922754287719727, + "kl": 1.291015625, + "learning_rate": 8.557333333333333e-07, + "loss": 0.0516, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.875, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.875, + "epoch": 0.2886666666666667, + "grad_norm": 7.597209453582764, + "kl": 0.5966796875, + "learning_rate": 8.556666666666666e-07, + "loss": 0.0239, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.75, + "epoch": 0.2888, + "grad_norm": 11.554757118225098, + "kl": 0.5400390625, + "learning_rate": 8.556e-07, + "loss": 0.0216, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.6875, + "epoch": 0.2889333333333333, + "grad_norm": 9.88530158996582, + "kl": 0.546875, + "learning_rate": 8.555333333333333e-07, + "loss": 0.0219, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.6875, + "epoch": 0.2890666666666667, + "grad_norm": 6.8318400382995605, + "kl": 0.5419921875, + "learning_rate": 8.554666666666667e-07, + "loss": 0.0216, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.5, + "epoch": 0.2892, + "grad_norm": 13.85969066619873, + "kl": 0.580078125, + "learning_rate": 8.554e-07, + "loss": 0.0232, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.625, + "epoch": 0.28933333333333333, + "grad_norm": 15.345696449279785, + "kl": 0.541015625, + "learning_rate": 8.553333333333333e-07, + "loss": 0.0216, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.25, + "epoch": 0.28946666666666665, + "grad_norm": 9.655131340026855, + "kl": 0.595703125, + "learning_rate": 8.552666666666666e-07, + "loss": 0.0238, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.125, + "epoch": 0.2896, + "grad_norm": 11.699614524841309, + "kl": 0.3740234375, + "learning_rate": 8.551999999999999e-07, + "loss": 0.0149, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.5625, + "epoch": 0.28973333333333334, + "grad_norm": 13.177685737609863, + "kl": 0.75, + "learning_rate": 8.551333333333333e-07, + "loss": 0.03, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.1875, + "epoch": 0.28986666666666666, + "grad_norm": 10.709954261779785, + "kl": 0.5078125, + "learning_rate": 8.550666666666666e-07, + "loss": 0.0203, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.8125, + "epoch": 0.29, + "grad_norm": 24.195377349853516, + "kl": 0.701171875, + "learning_rate": 8.55e-07, + "loss": 0.0281, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.0, + "epoch": 0.29013333333333335, + "grad_norm": 15.4932222366333, + "kl": 0.4365234375, + "learning_rate": 8.549333333333333e-07, + "loss": 0.0174, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.625, + "epoch": 0.2902666666666667, + "grad_norm": 6.8845319747924805, + "kl": 0.3427734375, + "learning_rate": 8.548666666666667e-07, + "loss": 0.0137, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.375, + "epoch": 0.2904, + "grad_norm": 10.045220375061035, + "kl": 0.27783203125, + "learning_rate": 8.548e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 0.2905333333333333, + "grad_norm": 0.9048585891723633, + "kl": 0.52734375, + "learning_rate": 8.547333333333334e-07, + "loss": 0.0211, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.25, + "epoch": 0.2906666666666667, + "grad_norm": 7.273117542266846, + "kl": 0.36328125, + "learning_rate": 8.546666666666666e-07, + "loss": 0.0146, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.75, + "epoch": 0.2908, + "grad_norm": 14.89930534362793, + "kl": 0.501953125, + "learning_rate": 8.545999999999999e-07, + "loss": 0.02, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.6875, + "epoch": 0.2909333333333333, + "grad_norm": 11.219034194946289, + "kl": 0.5302734375, + "learning_rate": 8.545333333333333e-07, + "loss": 0.0212, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.3125, + "epoch": 0.29106666666666664, + "grad_norm": 10.347867012023926, + "kl": 0.443359375, + "learning_rate": 8.544666666666666e-07, + "loss": 0.0178, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.625, + "epoch": 0.2912, + "grad_norm": 218.58547973632812, + "kl": 0.4814453125, + "learning_rate": 8.544e-07, + "loss": 0.0193, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5625, + "epoch": 0.29133333333333333, + "grad_norm": 10.226367950439453, + "kl": 0.36376953125, + "learning_rate": 8.543333333333332e-07, + "loss": 0.0145, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.5625, + "epoch": 0.29146666666666665, + "grad_norm": 10.901331901550293, + "kl": 0.29443359375, + "learning_rate": 8.542666666666666e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.1875, + "epoch": 0.2916, + "grad_norm": 6.217565536499023, + "kl": 0.306640625, + "learning_rate": 8.541999999999999e-07, + "loss": 0.0123, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.29173333333333334, + "grad_norm": 10.604538917541504, + "kl": 0.28173828125, + "learning_rate": 8.541333333333333e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.75, + "epoch": 0.29186666666666666, + "grad_norm": 8.718247413635254, + "kl": 0.2666015625, + "learning_rate": 8.540666666666666e-07, + "loss": 0.0107, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.5625, + "epoch": 0.292, + "grad_norm": 130.42474365234375, + "kl": 0.4140625, + "learning_rate": 8.539999999999999e-07, + "loss": 0.0166, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.75, + "epoch": 0.29213333333333336, + "grad_norm": 13.130376815795898, + "kl": 0.3828125, + "learning_rate": 8.539333333333333e-07, + "loss": 0.0154, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.5, + "epoch": 0.2922666666666667, + "grad_norm": 7.445771217346191, + "kl": 0.2763671875, + "learning_rate": 8.538666666666666e-07, + "loss": 0.011, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.0625, + "epoch": 0.2924, + "grad_norm": 5.858938694000244, + "kl": 0.38671875, + "learning_rate": 8.538e-07, + "loss": 0.0155, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.5625, + "epoch": 0.2925333333333333, + "grad_norm": 7.898849964141846, + "kl": 0.2939453125, + "learning_rate": 8.537333333333333e-07, + "loss": 0.0117, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 0.2926666666666667, + "grad_norm": 8.83077621459961, + "kl": 0.2177734375, + "learning_rate": 8.536666666666667e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.9375, + "epoch": 0.2928, + "grad_norm": 10.13149356842041, + "kl": 0.21484375, + "learning_rate": 8.536e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.0, + "epoch": 0.2929333333333333, + "grad_norm": 8.611364364624023, + "kl": 0.275390625, + "learning_rate": 8.535333333333334e-07, + "loss": 0.011, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.4375, + "epoch": 0.29306666666666664, + "grad_norm": 9.671414375305176, + "kl": 0.2626953125, + "learning_rate": 8.534666666666667e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0625, + "epoch": 0.2932, + "grad_norm": 19.310800552368164, + "kl": 0.3203125, + "learning_rate": 8.534000000000001e-07, + "loss": 0.0128, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.9375, + "epoch": 0.29333333333333333, + "grad_norm": 9.169354438781738, + "kl": 0.26953125, + "learning_rate": 8.533333333333334e-07, + "loss": 0.0108, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.375, + "epoch": 0.29346666666666665, + "grad_norm": 8.129545211791992, + "kl": 0.3623046875, + "learning_rate": 8.532666666666665e-07, + "loss": 0.0145, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.6875, + "epoch": 0.2936, + "grad_norm": 11.594199180603027, + "kl": 0.25634765625, + "learning_rate": 8.531999999999999e-07, + "loss": 0.0103, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.6875, + "epoch": 0.29373333333333335, + "grad_norm": 8.072861671447754, + "kl": 0.26806640625, + "learning_rate": 8.531333333333332e-07, + "loss": 0.0107, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.75, + "epoch": 0.29386666666666666, + "grad_norm": 6.612603187561035, + "kl": 0.28076171875, + "learning_rate": 8.530666666666666e-07, + "loss": 0.0112, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.625, + "epoch": 0.294, + "grad_norm": 9.736358642578125, + "kl": 0.2548828125, + "learning_rate": 8.529999999999999e-07, + "loss": 0.0102, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.29413333333333336, + "grad_norm": 0.8548669219017029, + "kl": 0.291015625, + "learning_rate": 8.529333333333333e-07, + "loss": 0.0116, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.2942666666666667, + "grad_norm": 11.944281578063965, + "kl": 0.3671875, + "learning_rate": 8.528666666666666e-07, + "loss": 0.0147, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.2944, + "grad_norm": 7.176209926605225, + "kl": 0.23974609375, + "learning_rate": 8.528e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.625, + "epoch": 0.2945333333333333, + "grad_norm": 11.303689002990723, + "kl": 0.25390625, + "learning_rate": 8.527333333333333e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.375, + "epoch": 0.2946666666666667, + "grad_norm": 9.957043647766113, + "kl": 0.27685546875, + "learning_rate": 8.526666666666666e-07, + "loss": 0.0111, + "reward": 0.9375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8125, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.875, + "epoch": 0.2948, + "grad_norm": 10.198081016540527, + "kl": 0.322265625, + "learning_rate": 8.526e-07, + "loss": 0.0129, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.2949333333333333, + "grad_norm": 10.23162841796875, + "kl": 0.27783203125, + "learning_rate": 8.525333333333333e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.29506666666666664, + "grad_norm": 8.213607788085938, + "kl": 0.22607421875, + "learning_rate": 8.524666666666667e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0, + "epoch": 0.2952, + "grad_norm": 5.45608377456665, + "kl": 0.21533203125, + "learning_rate": 8.524e-07, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.0, + "epoch": 0.29533333333333334, + "grad_norm": 6.2143473625183105, + "kl": 0.2880859375, + "learning_rate": 8.523333333333334e-07, + "loss": 0.0115, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.8125, + "epoch": 0.29546666666666666, + "grad_norm": 15.869304656982422, + "kl": 0.29345703125, + "learning_rate": 8.522666666666666e-07, + "loss": 0.0117, + "reward": 1.3125, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.2956, + "grad_norm": 10.222678184509277, + "kl": 0.24658203125, + "learning_rate": 8.522e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.875, + "epoch": 0.29573333333333335, + "grad_norm": 8.58787727355957, + "kl": 0.24072265625, + "learning_rate": 8.521333333333333e-07, + "loss": 0.0096, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.1875, + "epoch": 0.29586666666666667, + "grad_norm": 8.65876293182373, + "kl": 0.3583984375, + "learning_rate": 8.520666666666666e-07, + "loss": 0.0144, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.8125, + "epoch": 0.296, + "grad_norm": 55.83784866333008, + "kl": 0.2548828125, + "learning_rate": 8.52e-07, + "loss": 0.0102, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.3125, + "epoch": 0.29613333333333336, + "grad_norm": 5.655921459197998, + "kl": 0.2490234375, + "learning_rate": 8.519333333333332e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.9375, + "epoch": 0.2962666666666667, + "grad_norm": 10.477180480957031, + "kl": 0.4189453125, + "learning_rate": 8.518666666666666e-07, + "loss": 0.0168, + "reward": 1.0, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 0.2964, + "grad_norm": 6.134000301361084, + "kl": 0.19287109375, + "learning_rate": 8.517999999999999e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.1875, + "epoch": 0.2965333333333333, + "grad_norm": 9.530046463012695, + "kl": 0.2421875, + "learning_rate": 8.517333333333333e-07, + "loss": 0.0097, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 0.2966666666666667, + "grad_norm": 0.4319379925727844, + "kl": 0.20068359375, + "learning_rate": 8.516666666666666e-07, + "loss": 0.008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 0.2968, + "grad_norm": 10.525705337524414, + "kl": 0.6865234375, + "learning_rate": 8.516e-07, + "loss": 0.0275, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.2969333333333333, + "grad_norm": 38.68130874633789, + "kl": 0.24853515625, + "learning_rate": 8.515333333333333e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.125, + "epoch": 0.29706666666666665, + "grad_norm": 5.878066539764404, + "kl": 0.20068359375, + "learning_rate": 8.514666666666666e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 0.2972, + "grad_norm": 14.70491886138916, + "kl": 0.18994140625, + "learning_rate": 8.514e-07, + "loss": 0.0076, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.29733333333333334, + "grad_norm": 5.344730377197266, + "kl": 0.259765625, + "learning_rate": 8.513333333333333e-07, + "loss": 0.0104, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.29746666666666666, + "grad_norm": 4.7090582847595215, + "kl": 0.23974609375, + "learning_rate": 8.512666666666666e-07, + "loss": 0.0096, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.3125, + "epoch": 0.2976, + "grad_norm": 11.1129789352417, + "kl": 0.36474609375, + "learning_rate": 8.511999999999999e-07, + "loss": 0.0146, + "reward": 0.9375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6875, + "epoch": 0.29773333333333335, + "grad_norm": 6.879687786102295, + "kl": 0.20849609375, + "learning_rate": 8.511333333333333e-07, + "loss": 0.0084, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.29786666666666667, + "grad_norm": 10.838763236999512, + "kl": 0.22265625, + "learning_rate": 8.510666666666666e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.3125, + "epoch": 0.298, + "grad_norm": 166.4364013671875, + "kl": 0.3720703125, + "learning_rate": 8.51e-07, + "loss": 0.0149, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8125, + "epoch": 0.2981333333333333, + "grad_norm": 52.84848403930664, + "kl": 0.3720703125, + "learning_rate": 8.509333333333333e-07, + "loss": 0.0149, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6875, + "epoch": 0.2982666666666667, + "grad_norm": 7.484732151031494, + "kl": 0.1845703125, + "learning_rate": 8.508666666666666e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.625, + "epoch": 0.2984, + "grad_norm": 8.17243480682373, + "kl": 0.2724609375, + "learning_rate": 8.508e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0625, + "epoch": 0.2985333333333333, + "grad_norm": 5.95106840133667, + "kl": 0.263671875, + "learning_rate": 8.507333333333333e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.2986666666666667, + "grad_norm": 11.20078182220459, + "kl": 0.26513671875, + "learning_rate": 8.506666666666667e-07, + "loss": 0.0106, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6875, + "epoch": 0.2988, + "grad_norm": 7.641719341278076, + "kl": 0.18798828125, + "learning_rate": 8.506e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5, + "epoch": 0.29893333333333333, + "grad_norm": 9.7957181930542, + "kl": 0.30615234375, + "learning_rate": 8.505333333333334e-07, + "loss": 0.0122, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5, + "epoch": 0.29906666666666665, + "grad_norm": 8.00571346282959, + "kl": 0.20703125, + "learning_rate": 8.504666666666666e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.2992, + "grad_norm": 5.754045486450195, + "kl": 0.21435546875, + "learning_rate": 8.504e-07, + "loss": 0.0086, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.29933333333333334, + "grad_norm": 4.5362019538879395, + "kl": 0.22021484375, + "learning_rate": 8.503333333333333e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0625, + "epoch": 0.29946666666666666, + "grad_norm": 0.3666824698448181, + "kl": 0.18798828125, + "learning_rate": 8.502666666666665e-07, + "loss": 0.0075, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.125, + "epoch": 0.2996, + "grad_norm": 9.429265975952148, + "kl": 0.26025390625, + "learning_rate": 8.501999999999999e-07, + "loss": 0.0104, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.29973333333333335, + "grad_norm": 6.740275859832764, + "kl": 0.322265625, + "learning_rate": 8.501333333333332e-07, + "loss": 0.0129, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.625, + "epoch": 0.29986666666666667, + "grad_norm": 6.147159099578857, + "kl": 0.19580078125, + "learning_rate": 8.500666666666666e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.375, + "epoch": 0.3, + "grad_norm": 8.018207550048828, + "kl": 0.205078125, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0082, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.25, + "epoch": 0.3001333333333333, + "grad_norm": 4.089834690093994, + "kl": 0.3212890625, + "learning_rate": 8.499333333333333e-07, + "loss": 0.0129, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 0.3002666666666667, + "grad_norm": 8.106940269470215, + "kl": 0.1953125, + "learning_rate": 8.498666666666666e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.125, + "epoch": 0.3004, + "grad_norm": 2.5496885776519775, + "kl": 0.19921875, + "learning_rate": 8.498e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.8125, + "epoch": 0.3005333333333333, + "grad_norm": 9.375875473022461, + "kl": 0.18994140625, + "learning_rate": 8.497333333333333e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.875, + "epoch": 0.3006666666666667, + "grad_norm": 10.428975105285645, + "kl": 0.35546875, + "learning_rate": 8.496666666666667e-07, + "loss": 0.0142, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 0.3008, + "grad_norm": 7.16214656829834, + "kl": 0.2138671875, + "learning_rate": 8.496e-07, + "loss": 0.0086, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.625, + "epoch": 0.30093333333333333, + "grad_norm": 8.959321022033691, + "kl": 0.3359375, + "learning_rate": 8.495333333333333e-07, + "loss": 0.0134, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.625, + "epoch": 0.30106666666666665, + "grad_norm": 2.6180622577667236, + "kl": 0.17236328125, + "learning_rate": 8.494666666666667e-07, + "loss": 0.0069, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5, + "epoch": 0.3012, + "grad_norm": 8.072476387023926, + "kl": 0.3330078125, + "learning_rate": 8.494e-07, + "loss": 0.0133, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.30133333333333334, + "grad_norm": 10.92542839050293, + "kl": 0.2255859375, + "learning_rate": 8.493333333333334e-07, + "loss": 0.009, + "reward": 1.125, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8125, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.25, + "epoch": 0.30146666666666666, + "grad_norm": 10.437232971191406, + "kl": 0.255859375, + "learning_rate": 8.492666666666666e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6875, + "epoch": 0.3016, + "grad_norm": 5.311107158660889, + "kl": 0.24755859375, + "learning_rate": 8.492e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.125, + "epoch": 0.30173333333333335, + "grad_norm": 6.082234859466553, + "kl": 0.2802734375, + "learning_rate": 8.491333333333332e-07, + "loss": 0.0112, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.30186666666666667, + "grad_norm": 8.159868240356445, + "kl": 0.23779296875, + "learning_rate": 8.490666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1875, + "epoch": 0.302, + "grad_norm": 24.945415496826172, + "kl": 0.2392578125, + "learning_rate": 8.489999999999999e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5, + "epoch": 0.3021333333333333, + "grad_norm": 5.937922954559326, + "kl": 0.275390625, + "learning_rate": 8.489333333333332e-07, + "loss": 0.011, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.4375, + "epoch": 0.3022666666666667, + "grad_norm": 6.709442138671875, + "kl": 0.283203125, + "learning_rate": 8.488666666666666e-07, + "loss": 0.0113, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.25, + "epoch": 0.3024, + "grad_norm": 10.453605651855469, + "kl": 0.44921875, + "learning_rate": 8.487999999999999e-07, + "loss": 0.0179, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.3025333333333333, + "grad_norm": 6.328691482543945, + "kl": 0.2578125, + "learning_rate": 8.487333333333333e-07, + "loss": 0.0103, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5625, + "epoch": 0.30266666666666664, + "grad_norm": 13.679972648620605, + "kl": 0.2783203125, + "learning_rate": 8.486666666666666e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.25, + "epoch": 0.3028, + "grad_norm": 7.869081497192383, + "kl": 0.24951171875, + "learning_rate": 8.486e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.0, + "epoch": 0.30293333333333333, + "grad_norm": 13.232443809509277, + "kl": 0.25439453125, + "learning_rate": 8.485333333333333e-07, + "loss": 0.0102, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.30306666666666665, + "grad_norm": 12.369253158569336, + "kl": 0.2880859375, + "learning_rate": 8.484666666666667e-07, + "loss": 0.0115, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.875, + "epoch": 0.3032, + "grad_norm": 0.5112888813018799, + "kl": 0.29345703125, + "learning_rate": 8.484e-07, + "loss": 0.0117, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.30333333333333334, + "grad_norm": 9.896714210510254, + "kl": 0.2353515625, + "learning_rate": 8.483333333333333e-07, + "loss": 0.0094, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.30346666666666666, + "grad_norm": 8.90047836303711, + "kl": 0.24169921875, + "learning_rate": 8.482666666666666e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.5, + "epoch": 0.3036, + "grad_norm": 7.05854606628418, + "kl": 0.41015625, + "learning_rate": 8.481999999999999e-07, + "loss": 0.0164, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.3125, + "epoch": 0.30373333333333336, + "grad_norm": 10.002532958984375, + "kl": 0.2978515625, + "learning_rate": 8.481333333333333e-07, + "loss": 0.0119, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.9375, + "epoch": 0.3038666666666667, + "grad_norm": 4.6009721755981445, + "kl": 0.28662109375, + "learning_rate": 8.480666666666666e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.6875, + "epoch": 0.304, + "grad_norm": 8.488561630249023, + "kl": 0.2412109375, + "learning_rate": 8.48e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.8125, + "epoch": 0.3041333333333333, + "grad_norm": 9.94645881652832, + "kl": 0.24853515625, + "learning_rate": 8.479333333333333e-07, + "loss": 0.01, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.75, + "epoch": 0.3042666666666667, + "grad_norm": 18.893957138061523, + "kl": 0.70703125, + "learning_rate": 8.478666666666667e-07, + "loss": 0.0283, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0625, + "epoch": 0.3044, + "grad_norm": 9.116538047790527, + "kl": 0.2841796875, + "learning_rate": 8.478e-07, + "loss": 0.0114, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.25, + "epoch": 0.3045333333333333, + "grad_norm": 8.896397590637207, + "kl": 0.36279296875, + "learning_rate": 8.477333333333332e-07, + "loss": 0.0145, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.3125, + "epoch": 0.30466666666666664, + "grad_norm": 7.866990089416504, + "kl": 0.3671875, + "learning_rate": 8.476666666666666e-07, + "loss": 0.0147, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.625, + "epoch": 0.3048, + "grad_norm": 9.75432014465332, + "kl": 0.30908203125, + "learning_rate": 8.475999999999999e-07, + "loss": 0.0124, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.30493333333333333, + "grad_norm": 21.46416664123535, + "kl": 0.23828125, + "learning_rate": 8.475333333333333e-07, + "loss": 0.0095, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.875, + "epoch": 0.30506666666666665, + "grad_norm": 7.608131408691406, + "kl": 0.24560546875, + "learning_rate": 8.474666666666666e-07, + "loss": 0.0098, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 0.3052, + "grad_norm": 8.133071899414062, + "kl": 0.40234375, + "learning_rate": 8.474e-07, + "loss": 0.0161, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5, + "epoch": 0.30533333333333335, + "grad_norm": 9.313735008239746, + "kl": 0.2666015625, + "learning_rate": 8.473333333333333e-07, + "loss": 0.0107, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.125, + "epoch": 0.30546666666666666, + "grad_norm": 5.765848159790039, + "kl": 0.2568359375, + "learning_rate": 8.472666666666666e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.9375, + "epoch": 0.3056, + "grad_norm": 8.139453887939453, + "kl": 0.396484375, + "learning_rate": 8.471999999999999e-07, + "loss": 0.0159, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0625, + "epoch": 0.30573333333333336, + "grad_norm": 10.562070846557617, + "kl": 0.2880859375, + "learning_rate": 8.471333333333332e-07, + "loss": 0.0115, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.3058666666666667, + "grad_norm": 12.01034164428711, + "kl": 0.353515625, + "learning_rate": 8.470666666666666e-07, + "loss": 0.0141, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.306, + "grad_norm": 9.597943305969238, + "kl": 0.25732421875, + "learning_rate": 8.469999999999999e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.125, + "epoch": 0.3061333333333333, + "grad_norm": 11.673965454101562, + "kl": 0.2978515625, + "learning_rate": 8.469333333333333e-07, + "loss": 0.0119, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.4375, + "epoch": 0.3062666666666667, + "grad_norm": 9.305295944213867, + "kl": 0.369140625, + "learning_rate": 8.468666666666666e-07, + "loss": 0.0148, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.75, + "epoch": 0.3064, + "grad_norm": 50.9281005859375, + "kl": 0.53369140625, + "learning_rate": 8.468e-07, + "loss": 0.0213, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.3065333333333333, + "grad_norm": 9.184114456176758, + "kl": 0.427734375, + "learning_rate": 8.467333333333333e-07, + "loss": 0.0171, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 0.30666666666666664, + "grad_norm": 11.193938255310059, + "kl": 0.2822265625, + "learning_rate": 8.466666666666667e-07, + "loss": 0.0113, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 0.3068, + "grad_norm": 6.140224456787109, + "kl": 0.26171875, + "learning_rate": 8.466e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.1875, + "epoch": 0.30693333333333334, + "grad_norm": 12.346854209899902, + "kl": 0.36328125, + "learning_rate": 8.465333333333334e-07, + "loss": 0.0145, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.6875, + "epoch": 0.30706666666666665, + "grad_norm": 10.87006664276123, + "kl": 0.3251953125, + "learning_rate": 8.464666666666667e-07, + "loss": 0.013, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.375, + "epoch": 0.3072, + "grad_norm": 5.65651273727417, + "kl": 0.3017578125, + "learning_rate": 8.464e-07, + "loss": 0.012, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5625, + "epoch": 0.30733333333333335, + "grad_norm": 8.997559547424316, + "kl": 0.2939453125, + "learning_rate": 8.463333333333334e-07, + "loss": 0.0118, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 0.30746666666666667, + "grad_norm": 10.47632122039795, + "kl": 0.2802734375, + "learning_rate": 8.462666666666665e-07, + "loss": 0.0112, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4375, + "epoch": 0.3076, + "grad_norm": 9.479472160339355, + "kl": 0.3916015625, + "learning_rate": 8.461999999999999e-07, + "loss": 0.0157, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.375, + "epoch": 0.30773333333333336, + "grad_norm": 8.251877784729004, + "kl": 0.26171875, + "learning_rate": 8.461333333333332e-07, + "loss": 0.0105, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.6875, + "epoch": 0.3078666666666667, + "grad_norm": 11.815702438354492, + "kl": 0.34619140625, + "learning_rate": 8.460666666666666e-07, + "loss": 0.0138, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.875, + "epoch": 0.308, + "grad_norm": 16.904611587524414, + "kl": 0.3251953125, + "learning_rate": 8.459999999999999e-07, + "loss": 0.013, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.3081333333333333, + "grad_norm": 8.815507888793945, + "kl": 0.37353515625, + "learning_rate": 8.459333333333333e-07, + "loss": 0.0149, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.3125, + "epoch": 0.3082666666666667, + "grad_norm": 11.25853157043457, + "kl": 0.2705078125, + "learning_rate": 8.458666666666666e-07, + "loss": 0.0108, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.375, + "epoch": 0.3084, + "grad_norm": 10.027339935302734, + "kl": 0.4892578125, + "learning_rate": 8.457999999999999e-07, + "loss": 0.0196, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.875, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.75, + "epoch": 0.3085333333333333, + "grad_norm": 6.99530029296875, + "kl": 0.24560546875, + "learning_rate": 8.457333333333333e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 0.30866666666666664, + "grad_norm": 5.1736674308776855, + "kl": 0.2041015625, + "learning_rate": 8.456666666666666e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.4375, + "epoch": 0.3088, + "grad_norm": 7.47581148147583, + "kl": 0.30615234375, + "learning_rate": 8.456e-07, + "loss": 0.0123, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.375, + "epoch": 0.30893333333333334, + "grad_norm": 39.757713317871094, + "kl": 0.32177734375, + "learning_rate": 8.455333333333333e-07, + "loss": 0.0129, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.0625, + "epoch": 0.30906666666666666, + "grad_norm": 7.938990592956543, + "kl": 0.2998046875, + "learning_rate": 8.454666666666667e-07, + "loss": 0.012, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.25, + "epoch": 0.3092, + "grad_norm": 5.358563423156738, + "kl": 0.28515625, + "learning_rate": 8.454e-07, + "loss": 0.0114, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0, + "epoch": 0.30933333333333335, + "grad_norm": 64.5884780883789, + "kl": 0.3212890625, + "learning_rate": 8.453333333333334e-07, + "loss": 0.0129, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.30946666666666667, + "grad_norm": 16.365697860717773, + "kl": 0.5361328125, + "learning_rate": 8.452666666666667e-07, + "loss": 0.0214, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 0.3096, + "grad_norm": 7.042518615722656, + "kl": 0.35546875, + "learning_rate": 8.451999999999999e-07, + "loss": 0.0142, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.30973333333333336, + "grad_norm": 7.659196376800537, + "kl": 0.328125, + "learning_rate": 8.451333333333333e-07, + "loss": 0.0131, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3125, + "epoch": 0.3098666666666667, + "grad_norm": 12.149014472961426, + "kl": 0.3466796875, + "learning_rate": 8.450666666666666e-07, + "loss": 0.0139, + "reward": 1.0625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.8125, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.31, + "grad_norm": 0.49674656987190247, + "kl": 0.32421875, + "learning_rate": 8.45e-07, + "loss": 0.013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.3101333333333333, + "grad_norm": 0.747278094291687, + "kl": 0.359375, + "learning_rate": 8.449333333333332e-07, + "loss": 0.0144, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0, + "epoch": 0.3102666666666667, + "grad_norm": 8.702515602111816, + "kl": 0.279296875, + "learning_rate": 8.448666666666666e-07, + "loss": 0.0112, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 0.3104, + "grad_norm": 72.89041137695312, + "kl": 0.26171875, + "learning_rate": 8.447999999999999e-07, + "loss": 0.0105, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.4375, + "epoch": 0.31053333333333333, + "grad_norm": 9.185784339904785, + "kl": 0.3310546875, + "learning_rate": 8.447333333333333e-07, + "loss": 0.0132, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 0.31066666666666665, + "grad_norm": 8.050271987915039, + "kl": 0.2421875, + "learning_rate": 8.446666666666666e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0625, + "epoch": 0.3108, + "grad_norm": 8.647148132324219, + "kl": 0.142578125, + "learning_rate": 8.445999999999999e-07, + "loss": 0.0057, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.31093333333333334, + "grad_norm": 3.651470184326172, + "kl": 0.2763671875, + "learning_rate": 8.445333333333333e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5625, + "epoch": 0.31106666666666666, + "grad_norm": 9.186017990112305, + "kl": 0.24755859375, + "learning_rate": 8.444666666666666e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.3112, + "grad_norm": 8.850600242614746, + "kl": 0.19091796875, + "learning_rate": 8.444e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 0.31133333333333335, + "grad_norm": 8.600275039672852, + "kl": 0.236328125, + "learning_rate": 8.443333333333333e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.375, + "epoch": 0.31146666666666667, + "grad_norm": 35.12355422973633, + "kl": 0.3564453125, + "learning_rate": 8.442666666666667e-07, + "loss": 0.0142, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.375, + "epoch": 0.3116, + "grad_norm": 9.827582359313965, + "kl": 0.353515625, + "learning_rate": 8.441999999999999e-07, + "loss": 0.0142, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 0.3117333333333333, + "grad_norm": 7.865074157714844, + "kl": 0.26611328125, + "learning_rate": 8.441333333333333e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8125, + "epoch": 0.3118666666666667, + "grad_norm": 14.062100410461426, + "kl": 0.2548828125, + "learning_rate": 8.440666666666666e-07, + "loss": 0.0102, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.312, + "grad_norm": 26.518831253051758, + "kl": 0.19091796875, + "learning_rate": 8.439999999999999e-07, + "loss": 0.0076, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8125, + "epoch": 0.3121333333333333, + "grad_norm": 7.094132423400879, + "kl": 0.265625, + "learning_rate": 8.439333333333333e-07, + "loss": 0.0106, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.3122666666666667, + "grad_norm": 8.705432891845703, + "kl": 0.2255859375, + "learning_rate": 8.438666666666666e-07, + "loss": 0.009, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.3124, + "grad_norm": 4.191123962402344, + "kl": 0.1748046875, + "learning_rate": 8.438e-07, + "loss": 0.007, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.31253333333333333, + "grad_norm": 5.4289398193359375, + "kl": 0.2587890625, + "learning_rate": 8.437333333333333e-07, + "loss": 0.0104, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 0.31266666666666665, + "grad_norm": 5.393745422363281, + "kl": 0.1640625, + "learning_rate": 8.436666666666667e-07, + "loss": 0.0066, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0, + "epoch": 0.3128, + "grad_norm": 16.197412490844727, + "kl": 0.1982421875, + "learning_rate": 8.436e-07, + "loss": 0.0079, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.31293333333333334, + "grad_norm": 8.274968147277832, + "kl": 0.193359375, + "learning_rate": 8.435333333333334e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 0.31306666666666666, + "grad_norm": 7.194365978240967, + "kl": 0.19580078125, + "learning_rate": 8.434666666666666e-07, + "loss": 0.0078, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.125, + "epoch": 0.3132, + "grad_norm": 7.48644495010376, + "kl": 0.31494140625, + "learning_rate": 8.434e-07, + "loss": 0.0126, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 0.31333333333333335, + "grad_norm": 13.42468547821045, + "kl": 0.2236328125, + "learning_rate": 8.433333333333333e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.31346666666666667, + "grad_norm": 6.229968070983887, + "kl": 0.154052734375, + "learning_rate": 8.432666666666666e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.3136, + "grad_norm": 5.512455463409424, + "kl": 0.185546875, + "learning_rate": 8.431999999999999e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.3137333333333333, + "grad_norm": 5.117140293121338, + "kl": 0.3115234375, + "learning_rate": 8.431333333333332e-07, + "loss": 0.0125, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.0625, + "epoch": 0.3138666666666667, + "grad_norm": 6.626255512237549, + "kl": 0.228515625, + "learning_rate": 8.430666666666666e-07, + "loss": 0.0091, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.25, + "epoch": 0.314, + "grad_norm": 12.322070121765137, + "kl": 0.2080078125, + "learning_rate": 8.429999999999999e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.5, + "epoch": 0.3141333333333333, + "grad_norm": 12.287871360778809, + "kl": 0.22509765625, + "learning_rate": 8.429333333333333e-07, + "loss": 0.009, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 0.3142666666666667, + "grad_norm": 4.919610500335693, + "kl": 0.162109375, + "learning_rate": 8.428666666666666e-07, + "loss": 0.0065, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.3144, + "grad_norm": 5.194915294647217, + "kl": 0.18212890625, + "learning_rate": 8.428e-07, + "loss": 0.0073, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.625, + "epoch": 0.31453333333333333, + "grad_norm": 1.523969054222107, + "kl": 0.193359375, + "learning_rate": 8.427333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.31466666666666665, + "grad_norm": 6.1558685302734375, + "kl": 0.2431640625, + "learning_rate": 8.426666666666666e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.6875, + "epoch": 0.3148, + "grad_norm": 12.944558143615723, + "kl": 0.33203125, + "learning_rate": 8.426e-07, + "loss": 0.0133, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.625, + "epoch": 0.31493333333333334, + "grad_norm": 15.769129753112793, + "kl": 0.27734375, + "learning_rate": 8.425333333333333e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.31506666666666666, + "grad_norm": 12.461811065673828, + "kl": 0.251953125, + "learning_rate": 8.424666666666667e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.8125, + "epoch": 0.3152, + "grad_norm": 0.30813995003700256, + "kl": 0.1875, + "learning_rate": 8.424e-07, + "loss": 0.0075, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0625, + "epoch": 0.31533333333333335, + "grad_norm": 0.5518574118614197, + "kl": 0.2939453125, + "learning_rate": 8.423333333333334e-07, + "loss": 0.0117, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 0.3154666666666667, + "grad_norm": 4.2349724769592285, + "kl": 0.21630859375, + "learning_rate": 8.422666666666667e-07, + "loss": 0.0087, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.1875, + "epoch": 0.3156, + "grad_norm": 8.271180152893066, + "kl": 0.3203125, + "learning_rate": 8.422e-07, + "loss": 0.0128, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.3157333333333333, + "grad_norm": 6.813545227050781, + "kl": 0.20361328125, + "learning_rate": 8.421333333333333e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5, + "epoch": 0.3158666666666667, + "grad_norm": 6.426261901855469, + "kl": 0.16796875, + "learning_rate": 8.420666666666665e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.4375, + "epoch": 0.316, + "grad_norm": 10.161981582641602, + "kl": 0.2783203125, + "learning_rate": 8.419999999999999e-07, + "loss": 0.0112, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.9375, + "epoch": 0.3161333333333333, + "grad_norm": 8.589971542358398, + "kl": 0.23828125, + "learning_rate": 8.419333333333332e-07, + "loss": 0.0095, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.31626666666666664, + "grad_norm": 0.5447818040847778, + "kl": 0.3095703125, + "learning_rate": 8.418666666666666e-07, + "loss": 0.0124, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.75, + "epoch": 0.3164, + "grad_norm": 10.023124694824219, + "kl": 0.224609375, + "learning_rate": 8.417999999999999e-07, + "loss": 0.009, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.5, + "epoch": 0.31653333333333333, + "grad_norm": 14.01383113861084, + "kl": 0.265625, + "learning_rate": 8.417333333333333e-07, + "loss": 0.0106, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0, + "epoch": 0.31666666666666665, + "grad_norm": 8.301352500915527, + "kl": 0.20947265625, + "learning_rate": 8.416666666666666e-07, + "loss": 0.0084, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.25, + "epoch": 0.3168, + "grad_norm": 4.3275885581970215, + "kl": 0.21728515625, + "learning_rate": 8.416e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.6875, + "epoch": 0.31693333333333334, + "grad_norm": 7.2783894538879395, + "kl": 0.25244140625, + "learning_rate": 8.415333333333333e-07, + "loss": 0.0101, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0625, + "epoch": 0.31706666666666666, + "grad_norm": 5.909369468688965, + "kl": 0.205078125, + "learning_rate": 8.414666666666667e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.3172, + "grad_norm": 10.897028923034668, + "kl": 0.21142578125, + "learning_rate": 8.414e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.625, + "epoch": 0.31733333333333336, + "grad_norm": 0.8519197702407837, + "kl": 0.2490234375, + "learning_rate": 8.413333333333333e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.875, + "epoch": 0.3174666666666667, + "grad_norm": 8.263623237609863, + "kl": 0.21044921875, + "learning_rate": 8.412666666666667e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.4375, + "epoch": 0.3176, + "grad_norm": 14.471146583557129, + "kl": 0.279296875, + "learning_rate": 8.411999999999999e-07, + "loss": 0.0112, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.3177333333333333, + "grad_norm": 5.644404888153076, + "kl": 0.18994140625, + "learning_rate": 8.411333333333333e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5, + "epoch": 0.3178666666666667, + "grad_norm": 8.046801567077637, + "kl": 0.20849609375, + "learning_rate": 8.410666666666666e-07, + "loss": 0.0084, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.5625, + "epoch": 0.318, + "grad_norm": 0.9200207591056824, + "kl": 0.328125, + "learning_rate": 8.41e-07, + "loss": 0.0131, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.3181333333333333, + "grad_norm": 5.840371608734131, + "kl": 0.20263671875, + "learning_rate": 8.409333333333333e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.1875, + "epoch": 0.31826666666666664, + "grad_norm": 11.032069206237793, + "kl": 0.404296875, + "learning_rate": 8.408666666666667e-07, + "loss": 0.0161, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.1875, + "epoch": 0.3184, + "grad_norm": 11.803810119628906, + "kl": 0.181640625, + "learning_rate": 8.408e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.75, + "epoch": 0.31853333333333333, + "grad_norm": 7.061022758483887, + "kl": 0.18896484375, + "learning_rate": 8.407333333333333e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.875, + "epoch": 0.31866666666666665, + "grad_norm": 7.0734148025512695, + "kl": 0.19091796875, + "learning_rate": 8.406666666666667e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 0.3188, + "grad_norm": 5.333688735961914, + "kl": 0.18505859375, + "learning_rate": 8.405999999999999e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.31893333333333335, + "grad_norm": 3.911795139312744, + "kl": 0.2119140625, + "learning_rate": 8.405333333333333e-07, + "loss": 0.0085, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.25, + "epoch": 0.31906666666666667, + "grad_norm": 8.089877128601074, + "kl": 0.21044921875, + "learning_rate": 8.404666666666666e-07, + "loss": 0.0084, + "reward": 1.1875, + "reward_std": 0.8360271751880646, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.75, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 0.3192, + "grad_norm": 34.093017578125, + "kl": 0.20263671875, + "learning_rate": 8.404e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 0.31933333333333336, + "grad_norm": 14.59301471710205, + "kl": 0.2666015625, + "learning_rate": 8.403333333333333e-07, + "loss": 0.0107, + "reward": 0.9375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8125, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3125, + "epoch": 0.3194666666666667, + "grad_norm": 7.974295139312744, + "kl": 0.2744140625, + "learning_rate": 8.402666666666667e-07, + "loss": 0.011, + "reward": 1.25, + "reward_std": 0.9124869406223297, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.6875, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.3196, + "grad_norm": 0.6315288543701172, + "kl": 0.25439453125, + "learning_rate": 8.401999999999999e-07, + "loss": 0.0102, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5625, + "epoch": 0.3197333333333333, + "grad_norm": 6.44282341003418, + "kl": 0.25537109375, + "learning_rate": 8.401333333333332e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.875, + "epoch": 0.3198666666666667, + "grad_norm": 63.384925842285156, + "kl": 0.24169921875, + "learning_rate": 8.400666666666666e-07, + "loss": 0.0096, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.8125, + "epoch": 0.32, + "grad_norm": 4.927130222320557, + "kl": 0.22412109375, + "learning_rate": 8.399999999999999e-07, + "loss": 0.009, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.5625, + "epoch": 0.3201333333333333, + "grad_norm": 5.269084930419922, + "kl": 0.2734375, + "learning_rate": 8.399333333333333e-07, + "loss": 0.0109, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.875, + "epoch": 0.32026666666666664, + "grad_norm": 18.63452911376953, + "kl": 0.30224609375, + "learning_rate": 8.398666666666666e-07, + "loss": 0.0121, + "reward": 1.25, + "reward_std": 0.7490041851997375, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.1875, + "epoch": 0.3204, + "grad_norm": 8.430185317993164, + "kl": 0.21533203125, + "learning_rate": 8.398e-07, + "loss": 0.0086, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.4375, + "epoch": 0.32053333333333334, + "grad_norm": 6.628543376922607, + "kl": 0.18798828125, + "learning_rate": 8.397333333333333e-07, + "loss": 0.0075, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.32066666666666666, + "grad_norm": 10.672905921936035, + "kl": 0.357421875, + "learning_rate": 8.396666666666667e-07, + "loss": 0.0143, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.875, + "epoch": 0.3208, + "grad_norm": 18.548768997192383, + "kl": 0.20556640625, + "learning_rate": 8.396e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.32093333333333335, + "grad_norm": 6.579000473022461, + "kl": 0.25537109375, + "learning_rate": 8.395333333333333e-07, + "loss": 0.0102, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.32106666666666667, + "grad_norm": 6.6750054359436035, + "kl": 0.20361328125, + "learning_rate": 8.394666666666667e-07, + "loss": 0.0082, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.375, + "epoch": 0.3212, + "grad_norm": 10.033549308776855, + "kl": 0.17919921875, + "learning_rate": 8.394e-07, + "loss": 0.0072, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.125, + "epoch": 0.32133333333333336, + "grad_norm": 8.545272827148438, + "kl": 0.2001953125, + "learning_rate": 8.393333333333334e-07, + "loss": 0.008, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.75, + "epoch": 0.3214666666666667, + "grad_norm": 8.295689582824707, + "kl": 0.253662109375, + "learning_rate": 8.392666666666667e-07, + "loss": 0.0101, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.3125, + "epoch": 0.3216, + "grad_norm": 9.255606651306152, + "kl": 0.22705078125, + "learning_rate": 8.391999999999999e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5625, + "epoch": 0.3217333333333333, + "grad_norm": 6.023706436157227, + "kl": 0.15576171875, + "learning_rate": 8.391333333333332e-07, + "loss": 0.0062, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.125, + "epoch": 0.3218666666666667, + "grad_norm": 32.59972381591797, + "kl": 0.16357421875, + "learning_rate": 8.390666666666666e-07, + "loss": 0.0065, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1875, + "epoch": 0.322, + "grad_norm": 50.025299072265625, + "kl": 0.26611328125, + "learning_rate": 8.389999999999999e-07, + "loss": 0.0107, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.1875, + "epoch": 0.3221333333333333, + "grad_norm": 5.399568557739258, + "kl": 0.20263671875, + "learning_rate": 8.389333333333332e-07, + "loss": 0.0081, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.8125, + "epoch": 0.32226666666666665, + "grad_norm": 8.81025505065918, + "kl": 0.19873046875, + "learning_rate": 8.388666666666666e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5625, + "epoch": 0.3224, + "grad_norm": 12.443439483642578, + "kl": 0.15478515625, + "learning_rate": 8.387999999999999e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.75, + "epoch": 0.32253333333333334, + "grad_norm": 6.178957462310791, + "kl": 0.19921875, + "learning_rate": 8.387333333333333e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.375, + "epoch": 0.32266666666666666, + "grad_norm": 10.3128080368042, + "kl": 0.22265625, + "learning_rate": 8.386666666666666e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.375, + "epoch": 0.3228, + "grad_norm": 4.3635029792785645, + "kl": 0.219482421875, + "learning_rate": 8.386e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 0.32293333333333335, + "grad_norm": 5.977145671844482, + "kl": 0.19921875, + "learning_rate": 8.385333333333333e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.5625, + "epoch": 0.32306666666666667, + "grad_norm": 8.928860664367676, + "kl": 0.15625, + "learning_rate": 8.384666666666667e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.1875, + "epoch": 0.3232, + "grad_norm": 4.830140590667725, + "kl": 0.2236328125, + "learning_rate": 8.384e-07, + "loss": 0.009, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.875, + "epoch": 0.3233333333333333, + "grad_norm": 7.44024133682251, + "kl": 0.1669921875, + "learning_rate": 8.383333333333334e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.3234666666666667, + "grad_norm": 4.722578525543213, + "kl": 0.12646484375, + "learning_rate": 8.382666666666667e-07, + "loss": 0.0051, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4375, + "epoch": 0.3236, + "grad_norm": 4.933009147644043, + "kl": 0.18359375, + "learning_rate": 8.381999999999999e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0625, + "epoch": 0.3237333333333333, + "grad_norm": 0.3018795847892761, + "kl": 0.155029296875, + "learning_rate": 8.381333333333333e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1875, + "epoch": 0.3238666666666667, + "grad_norm": 3.7944607734680176, + "kl": 0.13916015625, + "learning_rate": 8.380666666666666e-07, + "loss": 0.0056, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.125, + "epoch": 0.324, + "grad_norm": 6.94210147857666, + "kl": 0.15966796875, + "learning_rate": 8.38e-07, + "loss": 0.0064, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75, + "epoch": 0.32413333333333333, + "grad_norm": 6.0028181076049805, + "kl": 0.26318359375, + "learning_rate": 8.379333333333333e-07, + "loss": 0.0105, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0625, + "epoch": 0.32426666666666665, + "grad_norm": 7.26242208480835, + "kl": 0.2099609375, + "learning_rate": 8.378666666666667e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0, + "epoch": 0.3244, + "grad_norm": 4.6100335121154785, + "kl": 0.25537109375, + "learning_rate": 8.377999999999999e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.125, + "epoch": 0.32453333333333334, + "grad_norm": 7.9381585121154785, + "kl": 0.14404296875, + "learning_rate": 8.377333333333333e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.25, + "epoch": 0.32466666666666666, + "grad_norm": 7.176220893859863, + "kl": 0.17822265625, + "learning_rate": 8.376666666666666e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.3248, + "grad_norm": 8.126144409179688, + "kl": 0.19580078125, + "learning_rate": 8.375999999999999e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.625, + "epoch": 0.32493333333333335, + "grad_norm": 4.3829874992370605, + "kl": 0.18896484375, + "learning_rate": 8.375333333333333e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0625, + "epoch": 0.32506666666666667, + "grad_norm": 5.478480339050293, + "kl": 0.19580078125, + "learning_rate": 8.374666666666666e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8125, + "epoch": 0.3252, + "grad_norm": 4.647100925445557, + "kl": 0.17724609375, + "learning_rate": 8.374e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 0.3253333333333333, + "grad_norm": 4.1988115310668945, + "kl": 0.1943359375, + "learning_rate": 8.373333333333333e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5625, + "epoch": 0.3254666666666667, + "grad_norm": 5.873173236846924, + "kl": 0.14794921875, + "learning_rate": 8.372666666666667e-07, + "loss": 0.0059, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.25, + "epoch": 0.3256, + "grad_norm": 10.994976043701172, + "kl": 0.16259765625, + "learning_rate": 8.372e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.0625, + "epoch": 0.3257333333333333, + "grad_norm": 25.867971420288086, + "kl": 0.2529296875, + "learning_rate": 8.371333333333333e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.625, + "epoch": 0.3258666666666667, + "grad_norm": 6.812145709991455, + "kl": 0.173583984375, + "learning_rate": 8.370666666666666e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.8349219560623169, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.326, + "grad_norm": 4.900110721588135, + "kl": 0.19873046875, + "learning_rate": 8.369999999999999e-07, + "loss": 0.0079, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.875, + "epoch": 0.32613333333333333, + "grad_norm": 4.258100509643555, + "kl": 0.1728515625, + "learning_rate": 8.369333333333333e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.375, + "epoch": 0.32626666666666665, + "grad_norm": 7.797533988952637, + "kl": 0.1826171875, + "learning_rate": 8.368666666666666e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.875, + "epoch": 0.3264, + "grad_norm": 5.9201788902282715, + "kl": 0.2412109375, + "learning_rate": 8.368e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6875, + "epoch": 0.32653333333333334, + "grad_norm": 7.84704065322876, + "kl": 0.22119140625, + "learning_rate": 8.367333333333333e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.75, + "epoch": 0.32666666666666666, + "grad_norm": 4.833934307098389, + "kl": 0.17724609375, + "learning_rate": 8.366666666666667e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.25, + "epoch": 0.3268, + "grad_norm": 7.208478927612305, + "kl": 0.20361328125, + "learning_rate": 8.366e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.9375, + "epoch": 0.32693333333333335, + "grad_norm": 5.69592809677124, + "kl": 0.23095703125, + "learning_rate": 8.365333333333334e-07, + "loss": 0.0092, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0625, + "epoch": 0.32706666666666667, + "grad_norm": 143.28504943847656, + "kl": 0.1611328125, + "learning_rate": 8.364666666666667e-07, + "loss": 0.0064, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.1875, + "epoch": 0.3272, + "grad_norm": 6.008325099945068, + "kl": 0.1298828125, + "learning_rate": 8.363999999999999e-07, + "loss": 0.0052, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.375, + "epoch": 0.3273333333333333, + "grad_norm": 23.89594268798828, + "kl": 0.17578125, + "learning_rate": 8.363333333333333e-07, + "loss": 0.007, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.125, + "epoch": 0.3274666666666667, + "grad_norm": 41.540122985839844, + "kl": 0.349609375, + "learning_rate": 8.362666666666666e-07, + "loss": 0.014, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.0625, + "epoch": 0.3276, + "grad_norm": 8.1549072265625, + "kl": 0.29443359375, + "learning_rate": 8.362e-07, + "loss": 0.0118, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.625, + "epoch": 0.3277333333333333, + "grad_norm": 6.794701099395752, + "kl": 0.16455078125, + "learning_rate": 8.361333333333332e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3125, + "epoch": 0.32786666666666664, + "grad_norm": 7.742727756500244, + "kl": 0.1640625, + "learning_rate": 8.360666666666666e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 0.328, + "grad_norm": 6.83503532409668, + "kl": 0.2568359375, + "learning_rate": 8.359999999999999e-07, + "loss": 0.0102, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.25, + "epoch": 0.32813333333333333, + "grad_norm": 7.481444835662842, + "kl": 0.24365234375, + "learning_rate": 8.359333333333333e-07, + "loss": 0.0097, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.125, + "epoch": 0.32826666666666665, + "grad_norm": 4.044581890106201, + "kl": 0.111572265625, + "learning_rate": 8.358666666666666e-07, + "loss": 0.0045, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.375, + "epoch": 0.3284, + "grad_norm": 0.31048354506492615, + "kl": 0.16259765625, + "learning_rate": 8.357999999999999e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1875, + "epoch": 0.32853333333333334, + "grad_norm": 5.996483325958252, + "kl": 0.24658203125, + "learning_rate": 8.357333333333333e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.32866666666666666, + "grad_norm": 1599.7542724609375, + "kl": 0.2236328125, + "learning_rate": 8.356666666666666e-07, + "loss": 0.009, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8125, + "epoch": 0.3288, + "grad_norm": 7.902216911315918, + "kl": 0.159912109375, + "learning_rate": 8.356e-07, + "loss": 0.0064, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5625, + "epoch": 0.32893333333333336, + "grad_norm": 6.068996429443359, + "kl": 0.19287109375, + "learning_rate": 8.355333333333333e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.1875, + "epoch": 0.3290666666666667, + "grad_norm": 5.359971523284912, + "kl": 0.17431640625, + "learning_rate": 8.354666666666667e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 0.3292, + "grad_norm": 5.6801323890686035, + "kl": 0.17236328125, + "learning_rate": 8.354e-07, + "loss": 0.0069, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.1875, + "epoch": 0.3293333333333333, + "grad_norm": 5.401894569396973, + "kl": 0.21923828125, + "learning_rate": 8.353333333333334e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8125, + "epoch": 0.3294666666666667, + "grad_norm": 7.889050483703613, + "kl": 0.166015625, + "learning_rate": 8.352666666666667e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.25, + "epoch": 0.3296, + "grad_norm": 5.28312349319458, + "kl": 0.17822265625, + "learning_rate": 8.352000000000001e-07, + "loss": 0.0071, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.875, + "epoch": 0.3297333333333333, + "grad_norm": 6.263187408447266, + "kl": 0.122802734375, + "learning_rate": 8.351333333333333e-07, + "loss": 0.0049, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.625, + "epoch": 0.32986666666666664, + "grad_norm": 7.702966690063477, + "kl": 0.201171875, + "learning_rate": 8.350666666666665e-07, + "loss": 0.0081, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.1875, + "epoch": 0.33, + "grad_norm": 6.488882541656494, + "kl": 0.17822265625, + "learning_rate": 8.349999999999999e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.875, + "epoch": 0.33013333333333333, + "grad_norm": 5.266260147094727, + "kl": 0.21142578125, + "learning_rate": 8.349333333333332e-07, + "loss": 0.0084, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.125, + "epoch": 0.33026666666666665, + "grad_norm": 10.126060485839844, + "kl": 0.14990234375, + "learning_rate": 8.348666666666666e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0625, + "epoch": 0.3304, + "grad_norm": 6.510374069213867, + "kl": 0.22021484375, + "learning_rate": 8.347999999999999e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 0.33053333333333335, + "grad_norm": 5.529057502746582, + "kl": 0.1494140625, + "learning_rate": 8.347333333333333e-07, + "loss": 0.006, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.3125, + "epoch": 0.33066666666666666, + "grad_norm": 6.987667083740234, + "kl": 0.240234375, + "learning_rate": 8.346666666666666e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.125, + "epoch": 0.3308, + "grad_norm": 6.157692909240723, + "kl": 0.2744140625, + "learning_rate": 8.346e-07, + "loss": 0.011, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.33093333333333336, + "grad_norm": 9.170190811157227, + "kl": 0.23046875, + "learning_rate": 8.345333333333333e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4375, + "epoch": 0.3310666666666667, + "grad_norm": 5.991401195526123, + "kl": 0.2197265625, + "learning_rate": 8.344666666666666e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.375, + "epoch": 0.3312, + "grad_norm": 5.341361045837402, + "kl": 0.18798828125, + "learning_rate": 8.344e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.25, + "epoch": 0.3313333333333333, + "grad_norm": 17.86606788635254, + "kl": 0.20556640625, + "learning_rate": 8.343333333333333e-07, + "loss": 0.0082, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.3314666666666667, + "grad_norm": 9.523198127746582, + "kl": 0.30078125, + "learning_rate": 8.342666666666667e-07, + "loss": 0.0121, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.3316, + "grad_norm": 5.922714710235596, + "kl": 0.15283203125, + "learning_rate": 8.342e-07, + "loss": 0.0061, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.75, + "epoch": 0.3317333333333333, + "grad_norm": 12.553971290588379, + "kl": 0.2021484375, + "learning_rate": 8.341333333333333e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.25, + "epoch": 0.33186666666666664, + "grad_norm": 7.558230876922607, + "kl": 0.27734375, + "learning_rate": 8.340666666666666e-07, + "loss": 0.0111, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.875, + "epoch": 0.332, + "grad_norm": 8.622694969177246, + "kl": 0.2529296875, + "learning_rate": 8.34e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 0.33213333333333334, + "grad_norm": 8.791379928588867, + "kl": 0.18359375, + "learning_rate": 8.339333333333333e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.33226666666666665, + "grad_norm": 0.4506772458553314, + "kl": 0.2080078125, + "learning_rate": 8.338666666666666e-07, + "loss": 0.0083, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8125, + "epoch": 0.3324, + "grad_norm": 6.280642509460449, + "kl": 0.20947265625, + "learning_rate": 8.338e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5, + "epoch": 0.33253333333333335, + "grad_norm": 3.8956966400146484, + "kl": 0.154296875, + "learning_rate": 8.337333333333333e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.33266666666666667, + "grad_norm": 171.29725646972656, + "kl": 0.205078125, + "learning_rate": 8.336666666666667e-07, + "loss": 0.0082, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.6875, + "epoch": 0.3328, + "grad_norm": 13.417969703674316, + "kl": 0.3984375, + "learning_rate": 8.335999999999999e-07, + "loss": 0.0159, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.25, + "epoch": 0.33293333333333336, + "grad_norm": 9.839095115661621, + "kl": 0.23583984375, + "learning_rate": 8.335333333333333e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.25, + "epoch": 0.3330666666666667, + "grad_norm": 0.4360743463039398, + "kl": 0.2216796875, + "learning_rate": 8.334666666666666e-07, + "loss": 0.0089, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.5, + "epoch": 0.3332, + "grad_norm": 6.318324089050293, + "kl": 0.2158203125, + "learning_rate": 8.334e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.3333333333333333, + "grad_norm": 9.351192474365234, + "kl": 0.2109375, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0084, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.1875, + "epoch": 0.3334666666666667, + "grad_norm": 4.6595683097839355, + "kl": 0.26220703125, + "learning_rate": 8.332666666666666e-07, + "loss": 0.0105, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8125, + "epoch": 0.3336, + "grad_norm": 6.573859691619873, + "kl": 0.171875, + "learning_rate": 8.332e-07, + "loss": 0.0069, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.875, + "epoch": 0.3337333333333333, + "grad_norm": 12.796130180358887, + "kl": 0.294921875, + "learning_rate": 8.331333333333332e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.33386666666666664, + "grad_norm": 9.950841903686523, + "kl": 0.22998046875, + "learning_rate": 8.330666666666666e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.125, + "epoch": 0.334, + "grad_norm": 6.3076171875, + "kl": 0.2265625, + "learning_rate": 8.329999999999999e-07, + "loss": 0.009, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9375, + "epoch": 0.33413333333333334, + "grad_norm": 11.390685081481934, + "kl": 0.2109375, + "learning_rate": 8.329333333333333e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.25, + "epoch": 0.33426666666666666, + "grad_norm": 8.526390075683594, + "kl": 0.3603515625, + "learning_rate": 8.328666666666666e-07, + "loss": 0.0144, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.3344, + "grad_norm": 7.5358099937438965, + "kl": 0.2861328125, + "learning_rate": 8.328e-07, + "loss": 0.0114, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1875, + "epoch": 0.33453333333333335, + "grad_norm": 0.31536173820495605, + "kl": 0.162109375, + "learning_rate": 8.327333333333333e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 0.33466666666666667, + "grad_norm": 7.873368263244629, + "kl": 0.179931640625, + "learning_rate": 8.326666666666666e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.3348, + "grad_norm": 0.35526353120803833, + "kl": 0.2109375, + "learning_rate": 8.326e-07, + "loss": 0.0084, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.375, + "epoch": 0.33493333333333336, + "grad_norm": 10.54922866821289, + "kl": 0.314453125, + "learning_rate": 8.325333333333333e-07, + "loss": 0.0126, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.3125, + "epoch": 0.3350666666666667, + "grad_norm": 7.9773054122924805, + "kl": 0.1513671875, + "learning_rate": 8.324666666666667e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5625, + "epoch": 0.3352, + "grad_norm": 5.473724842071533, + "kl": 0.18017578125, + "learning_rate": 8.324e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.3353333333333333, + "grad_norm": 17.021209716796875, + "kl": 0.12890625, + "learning_rate": 8.323333333333334e-07, + "loss": 0.0051, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0625, + "epoch": 0.3354666666666667, + "grad_norm": 5.238132953643799, + "kl": 0.1923828125, + "learning_rate": 8.322666666666667e-07, + "loss": 0.0077, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1875, + "epoch": 0.3356, + "grad_norm": 6.344785213470459, + "kl": 0.1396484375, + "learning_rate": 8.322000000000001e-07, + "loss": 0.0056, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.33573333333333333, + "grad_norm": 4.601764678955078, + "kl": 0.15966796875, + "learning_rate": 8.321333333333332e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.9375, + "epoch": 0.33586666666666665, + "grad_norm": 17.208843231201172, + "kl": 0.1708984375, + "learning_rate": 8.320666666666665e-07, + "loss": 0.0068, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.1875, + "epoch": 0.336, + "grad_norm": 4.590977191925049, + "kl": 0.127685546875, + "learning_rate": 8.319999999999999e-07, + "loss": 0.0051, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0625, + "epoch": 0.33613333333333334, + "grad_norm": 12.459805488586426, + "kl": 0.22021484375, + "learning_rate": 8.319333333333332e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.875, + "epoch": 0.33626666666666666, + "grad_norm": 13.417500495910645, + "kl": 0.21240234375, + "learning_rate": 8.318666666666666e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.3364, + "grad_norm": 4.678718566894531, + "kl": 0.2255859375, + "learning_rate": 8.317999999999999e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.33653333333333335, + "grad_norm": 9.351252555847168, + "kl": 0.3994140625, + "learning_rate": 8.317333333333333e-07, + "loss": 0.016, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.33666666666666667, + "grad_norm": 14.407407760620117, + "kl": 0.2470703125, + "learning_rate": 8.316666666666666e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 0.3368, + "grad_norm": 8.916619300842285, + "kl": 0.1787109375, + "learning_rate": 8.316e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.375, + "epoch": 0.3369333333333333, + "grad_norm": 6.078727722167969, + "kl": 0.15771484375, + "learning_rate": 8.315333333333333e-07, + "loss": 0.0063, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.8125, + "epoch": 0.3370666666666667, + "grad_norm": 5.606868267059326, + "kl": 0.1904296875, + "learning_rate": 8.314666666666667e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.3372, + "grad_norm": 8.12594223022461, + "kl": 0.143798828125, + "learning_rate": 8.314e-07, + "loss": 0.0058, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.25, + "epoch": 0.3373333333333333, + "grad_norm": 6.702122211456299, + "kl": 0.203125, + "learning_rate": 8.313333333333333e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.75, + "epoch": 0.3374666666666667, + "grad_norm": 0.42559900879859924, + "kl": 0.20947265625, + "learning_rate": 8.312666666666667e-07, + "loss": 0.0084, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.75, + "epoch": 0.3376, + "grad_norm": 5.792471408843994, + "kl": 0.174560546875, + "learning_rate": 8.312e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 0.33773333333333333, + "grad_norm": 7.748329162597656, + "kl": 0.13623046875, + "learning_rate": 8.311333333333333e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.1875, + "epoch": 0.33786666666666665, + "grad_norm": 4.170589447021484, + "kl": 0.22265625, + "learning_rate": 8.310666666666666e-07, + "loss": 0.0089, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.375, + "epoch": 0.338, + "grad_norm": 8.060178756713867, + "kl": 0.18603515625, + "learning_rate": 8.31e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0625, + "epoch": 0.33813333333333334, + "grad_norm": 0.6376252770423889, + "kl": 0.25146484375, + "learning_rate": 8.309333333333333e-07, + "loss": 0.0101, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.33826666666666666, + "grad_norm": 8.550968170166016, + "kl": 0.203125, + "learning_rate": 8.308666666666667e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.625, + "epoch": 0.3384, + "grad_norm": 7.9361252784729, + "kl": 0.25634765625, + "learning_rate": 8.308e-07, + "loss": 0.0103, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.125, + "epoch": 0.33853333333333335, + "grad_norm": 10.423552513122559, + "kl": 0.248046875, + "learning_rate": 8.307333333333332e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.125, + "epoch": 0.33866666666666667, + "grad_norm": 4.856717109680176, + "kl": 0.16748046875, + "learning_rate": 8.306666666666666e-07, + "loss": 0.0067, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.125, + "epoch": 0.3388, + "grad_norm": 7.674258232116699, + "kl": 0.18896484375, + "learning_rate": 8.305999999999999e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.625, + "epoch": 0.3389333333333333, + "grad_norm": 7.053342342376709, + "kl": 0.18017578125, + "learning_rate": 8.305333333333333e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 0.3390666666666667, + "grad_norm": 7.578158378601074, + "kl": 0.158203125, + "learning_rate": 8.304666666666666e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.8125, + "epoch": 0.3392, + "grad_norm": 9.118197441101074, + "kl": 0.23779296875, + "learning_rate": 8.304e-07, + "loss": 0.0095, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.625, + "epoch": 0.3393333333333333, + "grad_norm": 12.836730003356934, + "kl": 0.17529296875, + "learning_rate": 8.303333333333333e-07, + "loss": 0.007, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.3394666666666667, + "grad_norm": 7.765941619873047, + "kl": 0.1767578125, + "learning_rate": 8.302666666666667e-07, + "loss": 0.0071, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 0.3396, + "grad_norm": 8.375266075134277, + "kl": 0.1533203125, + "learning_rate": 8.302e-07, + "loss": 0.0061, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.25, + "epoch": 0.33973333333333333, + "grad_norm": 6.95817756652832, + "kl": 0.2822265625, + "learning_rate": 8.301333333333332e-07, + "loss": 0.0113, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.1875, + "epoch": 0.33986666666666665, + "grad_norm": 16.787992477416992, + "kl": 0.2001953125, + "learning_rate": 8.300666666666666e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.6875, + "epoch": 0.34, + "grad_norm": 8.086292266845703, + "kl": 0.13720703125, + "learning_rate": 8.299999999999999e-07, + "loss": 0.0055, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4375, + "epoch": 0.34013333333333334, + "grad_norm": 32.98393630981445, + "kl": 0.2109375, + "learning_rate": 8.299333333333333e-07, + "loss": 0.0084, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1875, + "epoch": 0.34026666666666666, + "grad_norm": 2.996229410171509, + "kl": 0.201171875, + "learning_rate": 8.298666666666666e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.875, + "epoch": 0.3404, + "grad_norm": 9.67513656616211, + "kl": 0.26171875, + "learning_rate": 8.298e-07, + "loss": 0.0105, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1875, + "epoch": 0.34053333333333335, + "grad_norm": 7.152886867523193, + "kl": 0.18115234375, + "learning_rate": 8.297333333333333e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.3406666666666667, + "grad_norm": 18.44702911376953, + "kl": 0.17724609375, + "learning_rate": 8.296666666666667e-07, + "loss": 0.0071, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.875, + "epoch": 0.3408, + "grad_norm": 7.122483253479004, + "kl": 0.15478515625, + "learning_rate": 8.296e-07, + "loss": 0.0062, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.25, + "epoch": 0.3409333333333333, + "grad_norm": 58.90892791748047, + "kl": 0.152099609375, + "learning_rate": 8.295333333333333e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3125, + "epoch": 0.3410666666666667, + "grad_norm": 6.893876075744629, + "kl": 0.12939453125, + "learning_rate": 8.294666666666667e-07, + "loss": 0.0052, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.3412, + "grad_norm": 3.327582597732544, + "kl": 0.23388671875, + "learning_rate": 8.294e-07, + "loss": 0.0093, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.375, + "epoch": 0.3413333333333333, + "grad_norm": 6.373438835144043, + "kl": 0.23046875, + "learning_rate": 8.293333333333333e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5625, + "epoch": 0.34146666666666664, + "grad_norm": 6.872705459594727, + "kl": 0.162109375, + "learning_rate": 8.292666666666666e-07, + "loss": 0.0065, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 0.3416, + "grad_norm": 8.23244571685791, + "kl": 0.1875, + "learning_rate": 8.292e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4375, + "epoch": 0.34173333333333333, + "grad_norm": 8.036413192749023, + "kl": 0.228515625, + "learning_rate": 8.291333333333333e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.6875, + "epoch": 0.34186666666666665, + "grad_norm": 7.4193854331970215, + "kl": 0.189453125, + "learning_rate": 8.290666666666666e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.625, + "epoch": 0.342, + "grad_norm": 5.749351978302002, + "kl": 0.1484375, + "learning_rate": 8.289999999999999e-07, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.125, + "epoch": 0.34213333333333334, + "grad_norm": 8.192626953125, + "kl": 0.14990234375, + "learning_rate": 8.289333333333332e-07, + "loss": 0.006, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.34226666666666666, + "grad_norm": 15.770835876464844, + "kl": 0.279296875, + "learning_rate": 8.288666666666666e-07, + "loss": 0.0112, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0, + "epoch": 0.3424, + "grad_norm": 8.907415390014648, + "kl": 0.1953125, + "learning_rate": 8.287999999999999e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.25, + "epoch": 0.34253333333333336, + "grad_norm": 8.580342292785645, + "kl": 0.16748046875, + "learning_rate": 8.287333333333333e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 0.3426666666666667, + "grad_norm": 8.805251121520996, + "kl": 0.1787109375, + "learning_rate": 8.286666666666666e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 0.3428, + "grad_norm": 10.813457489013672, + "kl": 0.1494140625, + "learning_rate": 8.286e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.375, + "epoch": 0.3429333333333333, + "grad_norm": 6.7066569328308105, + "kl": 0.144775390625, + "learning_rate": 8.285333333333333e-07, + "loss": 0.0058, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.3430666666666667, + "grad_norm": 7.9543633460998535, + "kl": 0.166015625, + "learning_rate": 8.284666666666667e-07, + "loss": 0.0066, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.5625, + "epoch": 0.3432, + "grad_norm": 7.478440284729004, + "kl": 0.24267578125, + "learning_rate": 8.284e-07, + "loss": 0.0097, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0, + "epoch": 0.3433333333333333, + "grad_norm": 6.823399066925049, + "kl": 0.3037109375, + "learning_rate": 8.283333333333334e-07, + "loss": 0.0121, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.375, + "epoch": 0.34346666666666664, + "grad_norm": 7.98619270324707, + "kl": 0.1572265625, + "learning_rate": 8.282666666666667e-07, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9375, + "epoch": 0.3436, + "grad_norm": 4.415018558502197, + "kl": 0.171875, + "learning_rate": 8.282e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.1875, + "epoch": 0.34373333333333334, + "grad_norm": 7.106647491455078, + "kl": 0.1728515625, + "learning_rate": 8.281333333333334e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3125, + "epoch": 0.34386666666666665, + "grad_norm": 11.66353988647461, + "kl": 0.18017578125, + "learning_rate": 8.280666666666666e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8125, + "epoch": 0.344, + "grad_norm": 4.408806324005127, + "kl": 0.215576171875, + "learning_rate": 8.28e-07, + "loss": 0.0086, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.625, + "epoch": 0.34413333333333335, + "grad_norm": 7.574824810028076, + "kl": 0.23291015625, + "learning_rate": 8.279333333333332e-07, + "loss": 0.0093, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 0.34426666666666667, + "grad_norm": 7.232400894165039, + "kl": 0.16455078125, + "learning_rate": 8.278666666666666e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.3444, + "grad_norm": 5.441911220550537, + "kl": 0.18359375, + "learning_rate": 8.277999999999999e-07, + "loss": 0.0073, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.34453333333333336, + "grad_norm": 7.203457355499268, + "kl": 0.162109375, + "learning_rate": 8.277333333333333e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1875, + "epoch": 0.3446666666666667, + "grad_norm": 8.815382957458496, + "kl": 0.24462890625, + "learning_rate": 8.276666666666666e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.3448, + "grad_norm": 5.8128204345703125, + "kl": 0.2626953125, + "learning_rate": 8.275999999999999e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0625, + "epoch": 0.3449333333333333, + "grad_norm": 7.543790817260742, + "kl": 0.251220703125, + "learning_rate": 8.275333333333333e-07, + "loss": 0.0101, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5, + "epoch": 0.3450666666666667, + "grad_norm": 5.0531487464904785, + "kl": 0.1279296875, + "learning_rate": 8.274666666666666e-07, + "loss": 0.0051, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.3452, + "grad_norm": 8.118072509765625, + "kl": 0.1650390625, + "learning_rate": 8.274e-07, + "loss": 0.0066, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.125, + "epoch": 0.3453333333333333, + "grad_norm": 6.503601551055908, + "kl": 0.126220703125, + "learning_rate": 8.273333333333333e-07, + "loss": 0.005, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.8125, + "epoch": 0.34546666666666664, + "grad_norm": 4.742255210876465, + "kl": 0.1572265625, + "learning_rate": 8.272666666666667e-07, + "loss": 0.0063, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.3456, + "grad_norm": 9.672138214111328, + "kl": 0.267578125, + "learning_rate": 8.272e-07, + "loss": 0.0107, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0625, + "epoch": 0.34573333333333334, + "grad_norm": 6.7807135581970215, + "kl": 0.14208984375, + "learning_rate": 8.271333333333334e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5625, + "epoch": 0.34586666666666666, + "grad_norm": 6.353598117828369, + "kl": 0.17138671875, + "learning_rate": 8.270666666666666e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.346, + "grad_norm": 8.61186408996582, + "kl": 0.1689453125, + "learning_rate": 8.269999999999999e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.34613333333333335, + "grad_norm": 7.56928014755249, + "kl": 0.14990234375, + "learning_rate": 8.269333333333333e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.25, + "epoch": 0.34626666666666667, + "grad_norm": 5.190825939178467, + "kl": 0.1962890625, + "learning_rate": 8.268666666666666e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 0.3464, + "grad_norm": 9.606184005737305, + "kl": 0.20947265625, + "learning_rate": 8.268e-07, + "loss": 0.0084, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 0.34653333333333336, + "grad_norm": 8.878372192382812, + "kl": 0.2119140625, + "learning_rate": 8.267333333333333e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.3466666666666667, + "grad_norm": 7.656772613525391, + "kl": 0.162109375, + "learning_rate": 8.266666666666667e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0, + "epoch": 0.3468, + "grad_norm": 5.242384910583496, + "kl": 0.22216796875, + "learning_rate": 8.266e-07, + "loss": 0.0089, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9375, + "epoch": 0.3469333333333333, + "grad_norm": 12.252285957336426, + "kl": 0.162109375, + "learning_rate": 8.265333333333333e-07, + "loss": 0.0065, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.125, + "epoch": 0.3470666666666667, + "grad_norm": 10.031885147094727, + "kl": 0.21875, + "learning_rate": 8.264666666666666e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6875, + "epoch": 0.3472, + "grad_norm": 6.282846927642822, + "kl": 0.18359375, + "learning_rate": 8.263999999999999e-07, + "loss": 0.0073, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.3473333333333333, + "grad_norm": 5.439013957977295, + "kl": 0.212890625, + "learning_rate": 8.263333333333333e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.34746666666666665, + "grad_norm": 12.497836112976074, + "kl": 0.19189453125, + "learning_rate": 8.262666666666666e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.625, + "epoch": 0.3476, + "grad_norm": 6.940851211547852, + "kl": 0.21142578125, + "learning_rate": 8.262e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0, + "epoch": 0.34773333333333334, + "grad_norm": 6.161384582519531, + "kl": 0.17626953125, + "learning_rate": 8.261333333333333e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.34786666666666666, + "grad_norm": 8.554142951965332, + "kl": 0.129150390625, + "learning_rate": 8.260666666666666e-07, + "loss": 0.0052, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.875, + "epoch": 0.348, + "grad_norm": 1.9912042617797852, + "kl": 0.2119140625, + "learning_rate": 8.259999999999999e-07, + "loss": 0.0085, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.5, + "epoch": 0.34813333333333335, + "grad_norm": 9.568450927734375, + "kl": 0.2802734375, + "learning_rate": 8.259333333333333e-07, + "loss": 0.0112, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.34826666666666667, + "grad_norm": 9.175553321838379, + "kl": 0.1376953125, + "learning_rate": 8.258666666666666e-07, + "loss": 0.0055, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.5, + "epoch": 0.3484, + "grad_norm": 8.143617630004883, + "kl": 0.2919921875, + "learning_rate": 8.257999999999999e-07, + "loss": 0.0117, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3125, + "epoch": 0.3485333333333333, + "grad_norm": 7.6807074546813965, + "kl": 0.1640625, + "learning_rate": 8.257333333333333e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.3486666666666667, + "grad_norm": 7.282520294189453, + "kl": 0.251953125, + "learning_rate": 8.256666666666666e-07, + "loss": 0.0101, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 0.3488, + "grad_norm": 8.359587669372559, + "kl": 0.156982421875, + "learning_rate": 8.256e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.3489333333333333, + "grad_norm": 8.805624961853027, + "kl": 0.143310546875, + "learning_rate": 8.255333333333333e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.625, + "epoch": 0.3490666666666667, + "grad_norm": 8.341593742370605, + "kl": 0.17919921875, + "learning_rate": 8.254666666666667e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 0.3492, + "grad_norm": 7.201435089111328, + "kl": 0.20458984375, + "learning_rate": 8.254e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.375, + "epoch": 0.34933333333333333, + "grad_norm": 7.027004718780518, + "kl": 0.17333984375, + "learning_rate": 8.253333333333334e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.125, + "epoch": 0.34946666666666665, + "grad_norm": 7.914317607879639, + "kl": 0.2001953125, + "learning_rate": 8.252666666666667e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.625, + "epoch": 0.3496, + "grad_norm": 3.6283586025238037, + "kl": 0.13330078125, + "learning_rate": 8.252000000000001e-07, + "loss": 0.0053, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.375, + "epoch": 0.34973333333333334, + "grad_norm": 33.86553192138672, + "kl": 0.2041015625, + "learning_rate": 8.251333333333334e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3125, + "epoch": 0.34986666666666666, + "grad_norm": 8.17764949798584, + "kl": 0.2255859375, + "learning_rate": 8.250666666666665e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.4375, + "epoch": 0.35, + "grad_norm": 5.115823268890381, + "kl": 0.19189453125, + "learning_rate": 8.249999999999999e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.9375, + "epoch": 0.35013333333333335, + "grad_norm": 8.116397857666016, + "kl": 0.17236328125, + "learning_rate": 8.249333333333332e-07, + "loss": 0.0069, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9375, + "epoch": 0.35026666666666667, + "grad_norm": 4.455536365509033, + "kl": 0.16455078125, + "learning_rate": 8.248666666666666e-07, + "loss": 0.0066, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.875, + "epoch": 0.3504, + "grad_norm": 16.249441146850586, + "kl": 0.17724609375, + "learning_rate": 8.247999999999999e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 0.3505333333333333, + "grad_norm": 4.423569202423096, + "kl": 0.160888671875, + "learning_rate": 8.247333333333333e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.3506666666666667, + "grad_norm": 5.30263614654541, + "kl": 0.330078125, + "learning_rate": 8.246666666666666e-07, + "loss": 0.0132, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.9375, + "epoch": 0.3508, + "grad_norm": 20.42301368713379, + "kl": 0.26171875, + "learning_rate": 8.246e-07, + "loss": 0.0104, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.75, + "epoch": 0.3509333333333333, + "grad_norm": 11.17013168334961, + "kl": 0.19384765625, + "learning_rate": 8.245333333333333e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 0.3510666666666667, + "grad_norm": 12.089456558227539, + "kl": 0.236328125, + "learning_rate": 8.244666666666666e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9375, + "epoch": 0.3512, + "grad_norm": 8.808185577392578, + "kl": 0.1201171875, + "learning_rate": 8.244e-07, + "loss": 0.0048, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.625, + "epoch": 0.35133333333333333, + "grad_norm": 12.432708740234375, + "kl": 0.13330078125, + "learning_rate": 8.243333333333333e-07, + "loss": 0.0053, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 0.35146666666666665, + "grad_norm": 0.6048902869224548, + "kl": 0.185546875, + "learning_rate": 8.242666666666667e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.3516, + "grad_norm": 9.346151351928711, + "kl": 0.16015625, + "learning_rate": 8.242e-07, + "loss": 0.0064, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.1875, + "epoch": 0.35173333333333334, + "grad_norm": 8.885621070861816, + "kl": 0.408203125, + "learning_rate": 8.241333333333334e-07, + "loss": 0.0163, + "reward": 1.5625, + "reward_std": 0.7216846346855164, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.8125, + "epoch": 0.35186666666666666, + "grad_norm": 8.809639930725098, + "kl": 0.29931640625, + "learning_rate": 8.240666666666666e-07, + "loss": 0.012, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6875, + "epoch": 0.352, + "grad_norm": 7.526425361633301, + "kl": 0.21923828125, + "learning_rate": 8.24e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.9375, + "epoch": 0.35213333333333335, + "grad_norm": 5.2081074714660645, + "kl": 0.2109375, + "learning_rate": 8.239333333333333e-07, + "loss": 0.0084, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0625, + "epoch": 0.3522666666666667, + "grad_norm": 6.748868465423584, + "kl": 0.13916015625, + "learning_rate": 8.238666666666666e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8125, + "epoch": 0.3524, + "grad_norm": 22.067989349365234, + "kl": 0.96875, + "learning_rate": 8.238e-07, + "loss": 0.0388, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.8125, + "epoch": 0.3525333333333333, + "grad_norm": 12.838095664978027, + "kl": 0.14599609375, + "learning_rate": 8.237333333333332e-07, + "loss": 0.0058, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.75, + "epoch": 0.3526666666666667, + "grad_norm": 9.512320518493652, + "kl": 0.25048828125, + "learning_rate": 8.236666666666666e-07, + "loss": 0.01, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1875, + "epoch": 0.3528, + "grad_norm": 6.600358963012695, + "kl": 0.16064453125, + "learning_rate": 8.235999999999999e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.3125, + "epoch": 0.3529333333333333, + "grad_norm": 9.048861503601074, + "kl": 0.1640625, + "learning_rate": 8.235333333333333e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3125, + "epoch": 0.35306666666666664, + "grad_norm": 8.33203125, + "kl": 0.4052734375, + "learning_rate": 8.234666666666666e-07, + "loss": 0.0162, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.3532, + "grad_norm": 18.0208683013916, + "kl": 0.2822265625, + "learning_rate": 8.234e-07, + "loss": 0.0113, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.9375, + "epoch": 0.35333333333333333, + "grad_norm": 6.1119489669799805, + "kl": 0.16064453125, + "learning_rate": 8.233333333333333e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4375, + "epoch": 0.35346666666666665, + "grad_norm": 6.134164333343506, + "kl": 0.15771484375, + "learning_rate": 8.232666666666666e-07, + "loss": 0.0063, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.3536, + "grad_norm": 7.695834636688232, + "kl": 0.19189453125, + "learning_rate": 8.232e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.625, + "epoch": 0.35373333333333334, + "grad_norm": 5.374007701873779, + "kl": 0.21875, + "learning_rate": 8.231333333333333e-07, + "loss": 0.0087, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.375, + "epoch": 0.35386666666666666, + "grad_norm": 53.949737548828125, + "kl": 0.2412109375, + "learning_rate": 8.230666666666666e-07, + "loss": 0.0097, + "reward": 1.1875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.354, + "grad_norm": 7.903242588043213, + "kl": 0.193359375, + "learning_rate": 8.229999999999999e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.4375, + "epoch": 0.35413333333333336, + "grad_norm": 8.300972938537598, + "kl": 0.2626953125, + "learning_rate": 8.229333333333333e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.625, + "epoch": 0.3542666666666667, + "grad_norm": 7.991018772125244, + "kl": 0.18701171875, + "learning_rate": 8.228666666666666e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.3544, + "grad_norm": 4.153321743011475, + "kl": 0.15087890625, + "learning_rate": 8.228e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5, + "epoch": 0.3545333333333333, + "grad_norm": 6.788488864898682, + "kl": 0.162109375, + "learning_rate": 8.227333333333333e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 0.3546666666666667, + "grad_norm": 7.486552715301514, + "kl": 0.2001953125, + "learning_rate": 8.226666666666666e-07, + "loss": 0.008, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5, + "epoch": 0.3548, + "grad_norm": 6.0044989585876465, + "kl": 0.16357421875, + "learning_rate": 8.226e-07, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.75, + "epoch": 0.3549333333333333, + "grad_norm": 8.391535758972168, + "kl": 0.2041015625, + "learning_rate": 8.225333333333333e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.35506666666666664, + "grad_norm": 8.118978500366211, + "kl": 0.205078125, + "learning_rate": 8.224666666666667e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.3552, + "grad_norm": 9.90708065032959, + "kl": 0.2744140625, + "learning_rate": 8.224e-07, + "loss": 0.011, + "reward": 1.0625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.35533333333333333, + "grad_norm": 5.050914764404297, + "kl": 0.178466796875, + "learning_rate": 8.223333333333334e-07, + "loss": 0.0071, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.125, + "epoch": 0.35546666666666665, + "grad_norm": 7.808152675628662, + "kl": 0.16064453125, + "learning_rate": 8.222666666666666e-07, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 0.3556, + "grad_norm": 4.43066930770874, + "kl": 0.23486328125, + "learning_rate": 8.222e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.35573333333333335, + "grad_norm": 9.101678848266602, + "kl": 0.18017578125, + "learning_rate": 8.221333333333333e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.875, + "epoch": 0.35586666666666666, + "grad_norm": 5.412226676940918, + "kl": 0.255859375, + "learning_rate": 8.220666666666665e-07, + "loss": 0.0102, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.6875, + "epoch": 0.356, + "grad_norm": 8.534835815429688, + "kl": 0.29296875, + "learning_rate": 8.219999999999999e-07, + "loss": 0.0117, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 0.35613333333333336, + "grad_norm": 6.479403495788574, + "kl": 0.132080078125, + "learning_rate": 8.219333333333332e-07, + "loss": 0.0053, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5625, + "epoch": 0.3562666666666667, + "grad_norm": 8.390820503234863, + "kl": 0.1630859375, + "learning_rate": 8.218666666666666e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.3564, + "grad_norm": 6.065777778625488, + "kl": 0.18310546875, + "learning_rate": 8.217999999999999e-07, + "loss": 0.0073, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.8125, + "epoch": 0.3565333333333333, + "grad_norm": 9.449637413024902, + "kl": 0.30859375, + "learning_rate": 8.217333333333333e-07, + "loss": 0.0124, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.1875, + "epoch": 0.3566666666666667, + "grad_norm": 12.401108741760254, + "kl": 0.41748046875, + "learning_rate": 8.216666666666666e-07, + "loss": 0.0167, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.1875, + "epoch": 0.3568, + "grad_norm": 5.058191299438477, + "kl": 0.16845703125, + "learning_rate": 8.216e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.4375, + "epoch": 0.3569333333333333, + "grad_norm": 10.668092727661133, + "kl": 0.23193359375, + "learning_rate": 8.215333333333333e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.5625, + "epoch": 0.35706666666666664, + "grad_norm": 11.614983558654785, + "kl": 0.2568359375, + "learning_rate": 8.214666666666667e-07, + "loss": 0.0103, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.1875, + "epoch": 0.3572, + "grad_norm": 0.4646831750869751, + "kl": 0.265625, + "learning_rate": 8.214e-07, + "loss": 0.0106, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5625, + "epoch": 0.35733333333333334, + "grad_norm": 7.405875205993652, + "kl": 0.158203125, + "learning_rate": 8.213333333333333e-07, + "loss": 0.0063, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.4375, + "epoch": 0.35746666666666665, + "grad_norm": 8.535709381103516, + "kl": 0.177734375, + "learning_rate": 8.212666666666667e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.75, + "epoch": 0.3576, + "grad_norm": 7.137031555175781, + "kl": 0.15478515625, + "learning_rate": 8.212e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.8125, + "epoch": 0.35773333333333335, + "grad_norm": 12.34476089477539, + "kl": 0.21484375, + "learning_rate": 8.211333333333334e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.35786666666666667, + "grad_norm": 8.700749397277832, + "kl": 0.1904296875, + "learning_rate": 8.210666666666666e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5625, + "epoch": 0.358, + "grad_norm": 13.308663368225098, + "kl": 0.24951171875, + "learning_rate": 8.21e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.35813333333333336, + "grad_norm": 5.141202449798584, + "kl": 0.18359375, + "learning_rate": 8.209333333333332e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3125, + "epoch": 0.3582666666666667, + "grad_norm": 7.810483932495117, + "kl": 0.2548828125, + "learning_rate": 8.208666666666666e-07, + "loss": 0.0102, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.75, + "epoch": 0.3584, + "grad_norm": 9.074911117553711, + "kl": 0.3427734375, + "learning_rate": 8.207999999999999e-07, + "loss": 0.0137, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 0.3585333333333333, + "grad_norm": 4.357283592224121, + "kl": 0.2119140625, + "learning_rate": 8.207333333333332e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0625, + "epoch": 0.3586666666666667, + "grad_norm": 7.807704448699951, + "kl": 0.166015625, + "learning_rate": 8.206666666666666e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0, + "epoch": 0.3588, + "grad_norm": 4.7199177742004395, + "kl": 0.2431640625, + "learning_rate": 8.205999999999999e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.8125, + "epoch": 0.3589333333333333, + "grad_norm": 27.073532104492188, + "kl": 0.20751953125, + "learning_rate": 8.205333333333333e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.35906666666666665, + "grad_norm": 7.686323165893555, + "kl": 0.18017578125, + "learning_rate": 8.204666666666666e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.75, + "epoch": 0.3592, + "grad_norm": 9.27499008178711, + "kl": 0.19091796875, + "learning_rate": 8.204e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.35933333333333334, + "grad_norm": 6.109932899475098, + "kl": 0.19580078125, + "learning_rate": 8.203333333333333e-07, + "loss": 0.0078, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.8125, + "epoch": 0.35946666666666666, + "grad_norm": 7.591004371643066, + "kl": 0.19189453125, + "learning_rate": 8.202666666666667e-07, + "loss": 0.0077, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5, + "epoch": 0.3596, + "grad_norm": 6.788517951965332, + "kl": 0.23876953125, + "learning_rate": 8.202e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0, + "epoch": 0.35973333333333335, + "grad_norm": 8.099176406860352, + "kl": 0.22265625, + "learning_rate": 8.201333333333333e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.35986666666666667, + "grad_norm": 7.106849670410156, + "kl": 0.140625, + "learning_rate": 8.200666666666667e-07, + "loss": 0.0056, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0625, + "epoch": 0.36, + "grad_norm": 9.741843223571777, + "kl": 0.24560546875, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.1875, + "epoch": 0.36013333333333336, + "grad_norm": 11.169212341308594, + "kl": 0.22802734375, + "learning_rate": 8.199333333333333e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.125, + "epoch": 0.3602666666666667, + "grad_norm": 11.852680206298828, + "kl": 0.462890625, + "learning_rate": 8.198666666666666e-07, + "loss": 0.0184, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.875, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5625, + "epoch": 0.3604, + "grad_norm": 6.14545202255249, + "kl": 0.19970703125, + "learning_rate": 8.198e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.3605333333333333, + "grad_norm": 8.526412963867188, + "kl": 0.19140625, + "learning_rate": 8.197333333333333e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.3606666666666667, + "grad_norm": 3.9311468601226807, + "kl": 0.16064453125, + "learning_rate": 8.196666666666667e-07, + "loss": 0.0064, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9375, + "epoch": 0.3608, + "grad_norm": 6.280090808868408, + "kl": 0.17626953125, + "learning_rate": 8.196e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 0.36093333333333333, + "grad_norm": 7.360044956207275, + "kl": 0.25732421875, + "learning_rate": 8.195333333333332e-07, + "loss": 0.0103, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0625, + "epoch": 0.36106666666666665, + "grad_norm": 7.155009746551514, + "kl": 0.166015625, + "learning_rate": 8.194666666666666e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5625, + "epoch": 0.3612, + "grad_norm": 8.7000093460083, + "kl": 0.216796875, + "learning_rate": 8.193999999999999e-07, + "loss": 0.0086, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.1875, + "epoch": 0.36133333333333334, + "grad_norm": 7.424136638641357, + "kl": 0.18994140625, + "learning_rate": 8.193333333333333e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 0.36146666666666666, + "grad_norm": 9.524513244628906, + "kl": 0.18212890625, + "learning_rate": 8.192666666666666e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 0.3616, + "grad_norm": 7.4097113609313965, + "kl": 0.152587890625, + "learning_rate": 8.192e-07, + "loss": 0.0061, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.1875, + "epoch": 0.36173333333333335, + "grad_norm": 4.896218299865723, + "kl": 0.15771484375, + "learning_rate": 8.191333333333333e-07, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.6875, + "epoch": 0.36186666666666667, + "grad_norm": 8.399391174316406, + "kl": 0.25537109375, + "learning_rate": 8.190666666666667e-07, + "loss": 0.0102, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.875, + "epoch": 0.362, + "grad_norm": 6.940490245819092, + "kl": 0.1943359375, + "learning_rate": 8.189999999999999e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.75, + "epoch": 0.3621333333333333, + "grad_norm": 4.644935131072998, + "kl": 0.19482421875, + "learning_rate": 8.189333333333332e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.4375, + "epoch": 0.3622666666666667, + "grad_norm": 7.99090576171875, + "kl": 0.525390625, + "learning_rate": 8.188666666666666e-07, + "loss": 0.021, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 0.3624, + "grad_norm": 13.692928314208984, + "kl": 0.132080078125, + "learning_rate": 8.187999999999999e-07, + "loss": 0.0053, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5625, + "epoch": 0.3625333333333333, + "grad_norm": 0.44598668813705444, + "kl": 0.23876953125, + "learning_rate": 8.187333333333333e-07, + "loss": 0.0095, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.3626666666666667, + "grad_norm": 8.058231353759766, + "kl": 0.17236328125, + "learning_rate": 8.186666666666666e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.375, + "epoch": 0.3628, + "grad_norm": 8.25452995300293, + "kl": 0.24267578125, + "learning_rate": 8.186e-07, + "loss": 0.0097, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0625, + "epoch": 0.36293333333333333, + "grad_norm": 5.197573184967041, + "kl": 0.15673828125, + "learning_rate": 8.185333333333333e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.3125, + "epoch": 0.36306666666666665, + "grad_norm": 5.6372222900390625, + "kl": 0.2705078125, + "learning_rate": 8.184666666666667e-07, + "loss": 0.0108, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.3632, + "grad_norm": 8.670086860656738, + "kl": 0.14794921875, + "learning_rate": 8.184e-07, + "loss": 0.0059, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.4375, + "epoch": 0.36333333333333334, + "grad_norm": 10.775789260864258, + "kl": 0.3583984375, + "learning_rate": 8.183333333333334e-07, + "loss": 0.0143, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.625, + "epoch": 0.36346666666666666, + "grad_norm": 23.17957305908203, + "kl": 0.197265625, + "learning_rate": 8.182666666666667e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 0.3636, + "grad_norm": 8.045486450195312, + "kl": 0.205078125, + "learning_rate": 8.182e-07, + "loss": 0.0082, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.4375, + "epoch": 0.36373333333333335, + "grad_norm": 6.660168647766113, + "kl": 0.26171875, + "learning_rate": 8.181333333333334e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 0.36386666666666667, + "grad_norm": 186.75814819335938, + "kl": 0.139404296875, + "learning_rate": 8.180666666666666e-07, + "loss": 0.0056, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0, + "epoch": 0.364, + "grad_norm": 9.76386547088623, + "kl": 0.3125, + "learning_rate": 8.179999999999999e-07, + "loss": 0.0125, + "reward": 1.3125, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0, + "epoch": 0.3641333333333333, + "grad_norm": 10.503154754638672, + "kl": 0.22119140625, + "learning_rate": 8.179333333333332e-07, + "loss": 0.0089, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 0.3642666666666667, + "grad_norm": 2.3267765045166016, + "kl": 0.3408203125, + "learning_rate": 8.178666666666666e-07, + "loss": 0.0136, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.3644, + "grad_norm": 7.588278770446777, + "kl": 0.2041015625, + "learning_rate": 8.177999999999999e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.3645333333333333, + "grad_norm": 11.07486343383789, + "kl": 0.3173828125, + "learning_rate": 8.177333333333333e-07, + "loss": 0.0127, + "reward": 1.3125, + "reward_std": 0.8152145147323608, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.4375, + "epoch": 0.36466666666666664, + "grad_norm": 12.301335334777832, + "kl": 0.4443359375, + "learning_rate": 8.176666666666666e-07, + "loss": 0.0178, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 0.3648, + "grad_norm": 5.034670352935791, + "kl": 0.19873046875, + "learning_rate": 8.175999999999999e-07, + "loss": 0.008, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.75, + "epoch": 0.36493333333333333, + "grad_norm": 6.169975757598877, + "kl": 0.23193359375, + "learning_rate": 8.175333333333333e-07, + "loss": 0.0093, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.4375, + "epoch": 0.36506666666666665, + "grad_norm": 6.117397308349609, + "kl": 0.2294921875, + "learning_rate": 8.174666666666666e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.3652, + "grad_norm": 11.989480972290039, + "kl": 0.140380859375, + "learning_rate": 8.174e-07, + "loss": 0.0056, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.0, + "epoch": 0.36533333333333334, + "grad_norm": 0.6854764223098755, + "kl": 0.3359375, + "learning_rate": 8.173333333333333e-07, + "loss": 0.0134, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0625, + "epoch": 0.36546666666666666, + "grad_norm": 5.154837131500244, + "kl": 0.251953125, + "learning_rate": 8.172666666666667e-07, + "loss": 0.0101, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.75, + "epoch": 0.3656, + "grad_norm": 0.7626264691352844, + "kl": 0.2001953125, + "learning_rate": 8.172e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.875, + "epoch": 0.36573333333333335, + "grad_norm": 10.816156387329102, + "kl": 0.380859375, + "learning_rate": 8.171333333333334e-07, + "loss": 0.0152, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.625, + "epoch": 0.3658666666666667, + "grad_norm": 7.129312515258789, + "kl": 0.2626953125, + "learning_rate": 8.170666666666667e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0, + "epoch": 0.366, + "grad_norm": 4.648766040802002, + "kl": 0.1484375, + "learning_rate": 8.169999999999999e-07, + "loss": 0.0059, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.3661333333333333, + "grad_norm": 8.379192352294922, + "kl": 0.236328125, + "learning_rate": 8.169333333333333e-07, + "loss": 0.0095, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.625, + "epoch": 0.3662666666666667, + "grad_norm": 8.496088981628418, + "kl": 0.28515625, + "learning_rate": 8.168666666666666e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.4375, + "epoch": 0.3664, + "grad_norm": 15.854646682739258, + "kl": 0.28466796875, + "learning_rate": 8.168e-07, + "loss": 0.0114, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.3665333333333333, + "grad_norm": 9.042591094970703, + "kl": 0.20458984375, + "learning_rate": 8.167333333333332e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.25, + "epoch": 0.36666666666666664, + "grad_norm": 10.366133689880371, + "kl": 0.17919921875, + "learning_rate": 8.166666666666666e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.6875, + "epoch": 0.3668, + "grad_norm": 8.161622047424316, + "kl": 0.16357421875, + "learning_rate": 8.165999999999999e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.75, + "epoch": 0.36693333333333333, + "grad_norm": 9.395075798034668, + "kl": 0.1982421875, + "learning_rate": 8.165333333333333e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.4375, + "epoch": 0.36706666666666665, + "grad_norm": 5.8928022384643555, + "kl": 0.2470703125, + "learning_rate": 8.164666666666666e-07, + "loss": 0.0099, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5625, + "epoch": 0.3672, + "grad_norm": 5.798623085021973, + "kl": 0.3232421875, + "learning_rate": 8.163999999999999e-07, + "loss": 0.013, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8125, + "epoch": 0.36733333333333335, + "grad_norm": 9.52734375, + "kl": 0.2119140625, + "learning_rate": 8.163333333333333e-07, + "loss": 0.0085, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0, + "epoch": 0.36746666666666666, + "grad_norm": 8.704648971557617, + "kl": 0.2734375, + "learning_rate": 8.162666666666666e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.1875, + "epoch": 0.3676, + "grad_norm": 6.773966312408447, + "kl": 0.19384765625, + "learning_rate": 8.162e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0, + "epoch": 0.36773333333333336, + "grad_norm": 0.48015403747558594, + "kl": 0.2158203125, + "learning_rate": 8.161333333333333e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.6875, + "epoch": 0.3678666666666667, + "grad_norm": 6.39434814453125, + "kl": 0.26416015625, + "learning_rate": 8.160666666666667e-07, + "loss": 0.0106, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.3125, + "epoch": 0.368, + "grad_norm": 10.612141609191895, + "kl": 0.49609375, + "learning_rate": 8.159999999999999e-07, + "loss": 0.0198, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.3681333333333333, + "grad_norm": 7.870477199554443, + "kl": 0.150390625, + "learning_rate": 8.159333333333333e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.375, + "epoch": 0.3682666666666667, + "grad_norm": 48.190521240234375, + "kl": 0.27783203125, + "learning_rate": 8.158666666666666e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 0.3684, + "grad_norm": 5.15291690826416, + "kl": 0.19384765625, + "learning_rate": 8.157999999999999e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5625, + "epoch": 0.3685333333333333, + "grad_norm": 5.933975696563721, + "kl": 0.224609375, + "learning_rate": 8.157333333333333e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.75, + "epoch": 0.36866666666666664, + "grad_norm": 8.650486946105957, + "kl": 0.23486328125, + "learning_rate": 8.156666666666666e-07, + "loss": 0.0094, + "reward": 1.1875, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0, + "epoch": 0.3688, + "grad_norm": 7.341052055358887, + "kl": 0.23388671875, + "learning_rate": 8.156e-07, + "loss": 0.0094, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.6875, + "epoch": 0.36893333333333334, + "grad_norm": 9.140198707580566, + "kl": 0.31640625, + "learning_rate": 8.155333333333333e-07, + "loss": 0.0127, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 0.36906666666666665, + "grad_norm": 6.843516826629639, + "kl": 0.232421875, + "learning_rate": 8.154666666666667e-07, + "loss": 0.0093, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.3692, + "grad_norm": 0.4088277518749237, + "kl": 0.23876953125, + "learning_rate": 8.154e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.125, + "epoch": 0.36933333333333335, + "grad_norm": 6.981982231140137, + "kl": 0.28466796875, + "learning_rate": 8.153333333333334e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.36946666666666667, + "grad_norm": 8.666601181030273, + "kl": 0.16748046875, + "learning_rate": 8.152666666666666e-07, + "loss": 0.0067, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.625, + "epoch": 0.3696, + "grad_norm": 15.675992965698242, + "kl": 0.328125, + "learning_rate": 8.152e-07, + "loss": 0.0131, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.36973333333333336, + "grad_norm": 9.598918914794922, + "kl": 0.24609375, + "learning_rate": 8.151333333333333e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.375, + "epoch": 0.3698666666666667, + "grad_norm": 9.294700622558594, + "kl": 0.1552734375, + "learning_rate": 8.150666666666666e-07, + "loss": 0.0062, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1875, + "epoch": 0.37, + "grad_norm": 8.119827270507812, + "kl": 0.17822265625, + "learning_rate": 8.149999999999999e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6875, + "epoch": 0.3701333333333333, + "grad_norm": 8.507399559020996, + "kl": 0.164794921875, + "learning_rate": 8.149333333333332e-07, + "loss": 0.0066, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.3702666666666667, + "grad_norm": 0.4126998484134674, + "kl": 0.135986328125, + "learning_rate": 8.148666666666666e-07, + "loss": 0.0054, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5, + "epoch": 0.3704, + "grad_norm": 5.968639373779297, + "kl": 0.128173828125, + "learning_rate": 8.147999999999999e-07, + "loss": 0.0051, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.3705333333333333, + "grad_norm": 7.136787414550781, + "kl": 0.20703125, + "learning_rate": 8.147333333333333e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.125, + "epoch": 0.37066666666666664, + "grad_norm": 8.843564987182617, + "kl": 0.24609375, + "learning_rate": 8.146666666666666e-07, + "loss": 0.0099, + "reward": 1.25, + "reward_std": 0.6760360598564148, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.1875, + "epoch": 0.3708, + "grad_norm": 7.188713550567627, + "kl": 0.294921875, + "learning_rate": 8.146e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.37093333333333334, + "grad_norm": 11.350226402282715, + "kl": 0.251953125, + "learning_rate": 8.145333333333333e-07, + "loss": 0.0101, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.375, + "epoch": 0.37106666666666666, + "grad_norm": 4.657131671905518, + "kl": 0.30029296875, + "learning_rate": 8.144666666666666e-07, + "loss": 0.012, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.25, + "epoch": 0.3712, + "grad_norm": 10.158525466918945, + "kl": 0.14208984375, + "learning_rate": 8.144e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.8125, + "epoch": 0.37133333333333335, + "grad_norm": 8.171286582946777, + "kl": 0.20361328125, + "learning_rate": 8.143333333333333e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.9375, + "epoch": 0.37146666666666667, + "grad_norm": 4.591734409332275, + "kl": 0.212890625, + "learning_rate": 8.142666666666667e-07, + "loss": 0.0085, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0625, + "epoch": 0.3716, + "grad_norm": 7.473776340484619, + "kl": 0.203125, + "learning_rate": 8.142e-07, + "loss": 0.0081, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.0625, + "epoch": 0.37173333333333336, + "grad_norm": 5.4534196853637695, + "kl": 0.30517578125, + "learning_rate": 8.141333333333334e-07, + "loss": 0.0122, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.875, + "epoch": 0.3718666666666667, + "grad_norm": 8.062438011169434, + "kl": 0.211669921875, + "learning_rate": 8.140666666666667e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.75, + "epoch": 0.372, + "grad_norm": 3.359683036804199, + "kl": 0.23828125, + "learning_rate": 8.14e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.3721333333333333, + "grad_norm": 9.711092948913574, + "kl": 0.33740234375, + "learning_rate": 8.139333333333332e-07, + "loss": 0.0135, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.3722666666666667, + "grad_norm": 7.75446891784668, + "kl": 0.21435546875, + "learning_rate": 8.138666666666665e-07, + "loss": 0.0086, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.4375, + "epoch": 0.3724, + "grad_norm": 6.6777873039245605, + "kl": 0.240234375, + "learning_rate": 8.137999999999999e-07, + "loss": 0.0096, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.625, + "epoch": 0.3725333333333333, + "grad_norm": 15.770329475402832, + "kl": 0.201171875, + "learning_rate": 8.137333333333332e-07, + "loss": 0.008, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.37266666666666665, + "grad_norm": 6.798732280731201, + "kl": 0.1708984375, + "learning_rate": 8.136666666666666e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.625, + "epoch": 0.3728, + "grad_norm": 11.967657089233398, + "kl": 0.27734375, + "learning_rate": 8.135999999999999e-07, + "loss": 0.0111, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.375, + "epoch": 0.37293333333333334, + "grad_norm": 4.753871917724609, + "kl": 0.13818359375, + "learning_rate": 8.135333333333333e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.0, + "epoch": 0.37306666666666666, + "grad_norm": 15.23923397064209, + "kl": 0.4228515625, + "learning_rate": 8.134666666666666e-07, + "loss": 0.0169, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.1875, + "epoch": 0.3732, + "grad_norm": 11.023826599121094, + "kl": 0.33203125, + "learning_rate": 8.134e-07, + "loss": 0.0133, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.75, + "epoch": 0.37333333333333335, + "grad_norm": 13.267797470092773, + "kl": 0.314453125, + "learning_rate": 8.133333333333333e-07, + "loss": 0.0126, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.37346666666666667, + "grad_norm": 12.667831420898438, + "kl": 0.4130859375, + "learning_rate": 8.132666666666666e-07, + "loss": 0.0166, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.4375, + "epoch": 0.3736, + "grad_norm": 7.32610559463501, + "kl": 0.1513671875, + "learning_rate": 8.132e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.1875, + "epoch": 0.3737333333333333, + "grad_norm": 7.230432987213135, + "kl": 0.4208984375, + "learning_rate": 8.131333333333333e-07, + "loss": 0.0168, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.0, + "epoch": 0.3738666666666667, + "grad_norm": 9.879120826721191, + "kl": 0.32421875, + "learning_rate": 8.130666666666667e-07, + "loss": 0.0129, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.6875, + "epoch": 0.374, + "grad_norm": 9.64929485321045, + "kl": 0.2978515625, + "learning_rate": 8.129999999999999e-07, + "loss": 0.0119, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.375, + "epoch": 0.3741333333333333, + "grad_norm": 53.493778228759766, + "kl": 0.3134765625, + "learning_rate": 8.129333333333333e-07, + "loss": 0.0125, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.0, + "epoch": 0.3742666666666667, + "grad_norm": 7.803080081939697, + "kl": 0.294921875, + "learning_rate": 8.128666666666666e-07, + "loss": 0.0118, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.25, + "epoch": 0.3744, + "grad_norm": 12.044168472290039, + "kl": 0.3515625, + "learning_rate": 8.128e-07, + "loss": 0.0141, + "reward": 1.3125, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.5625, + "epoch": 0.37453333333333333, + "grad_norm": 12.32894229888916, + "kl": 0.25537109375, + "learning_rate": 8.127333333333333e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.8125, + "epoch": 0.37466666666666665, + "grad_norm": 12.702434539794922, + "kl": 0.474609375, + "learning_rate": 8.126666666666666e-07, + "loss": 0.019, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.3748, + "grad_norm": 7.988397598266602, + "kl": 0.275390625, + "learning_rate": 8.126e-07, + "loss": 0.011, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.75, + "epoch": 0.37493333333333334, + "grad_norm": 8.861106872558594, + "kl": 0.3896484375, + "learning_rate": 8.125333333333333e-07, + "loss": 0.0156, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.37506666666666666, + "grad_norm": 11.20771312713623, + "kl": 0.537109375, + "learning_rate": 8.124666666666666e-07, + "loss": 0.0215, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.3125, + "epoch": 0.3752, + "grad_norm": 8.989139556884766, + "kl": 0.392578125, + "learning_rate": 8.123999999999999e-07, + "loss": 0.0157, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.375, + "epoch": 0.37533333333333335, + "grad_norm": 10.860733985900879, + "kl": 0.2373046875, + "learning_rate": 8.123333333333333e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.0625, + "epoch": 0.37546666666666667, + "grad_norm": 9.51722526550293, + "kl": 0.390625, + "learning_rate": 8.122666666666666e-07, + "loss": 0.0156, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 0.3756, + "grad_norm": 9.37057113647461, + "kl": 0.2490234375, + "learning_rate": 8.122e-07, + "loss": 0.01, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.0, + "epoch": 0.3757333333333333, + "grad_norm": 5.109945774078369, + "kl": 0.24853515625, + "learning_rate": 8.121333333333333e-07, + "loss": 0.0099, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.4375, + "epoch": 0.3758666666666667, + "grad_norm": 8.160186767578125, + "kl": 0.18359375, + "learning_rate": 8.120666666666667e-07, + "loss": 0.0073, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.5625, + "epoch": 0.376, + "grad_norm": 8.073342323303223, + "kl": 0.34423828125, + "learning_rate": 8.12e-07, + "loss": 0.0138, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.375, + "epoch": 0.3761333333333333, + "grad_norm": 9.085104942321777, + "kl": 0.24462890625, + "learning_rate": 8.119333333333332e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.5625, + "epoch": 0.3762666666666667, + "grad_norm": 12.281721115112305, + "kl": 0.341796875, + "learning_rate": 8.118666666666666e-07, + "loss": 0.0137, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.375, + "epoch": 0.3764, + "grad_norm": 11.14796257019043, + "kl": 0.44140625, + "learning_rate": 8.117999999999999e-07, + "loss": 0.0177, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.0, + "epoch": 0.37653333333333333, + "grad_norm": 11.39093017578125, + "kl": 0.3251953125, + "learning_rate": 8.117333333333333e-07, + "loss": 0.013, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.625, + "epoch": 0.37666666666666665, + "grad_norm": 6.1147541999816895, + "kl": 0.37646484375, + "learning_rate": 8.116666666666666e-07, + "loss": 0.0151, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.5625, + "epoch": 0.3768, + "grad_norm": 7.025781154632568, + "kl": 0.4970703125, + "learning_rate": 8.116e-07, + "loss": 0.0199, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.75, + "epoch": 0.37693333333333334, + "grad_norm": 9.424162864685059, + "kl": 0.3466796875, + "learning_rate": 8.115333333333333e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0, + "epoch": 0.37706666666666666, + "grad_norm": 8.844171524047852, + "kl": 0.232421875, + "learning_rate": 8.114666666666667e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.0625, + "epoch": 0.3772, + "grad_norm": 7.533930778503418, + "kl": 0.23828125, + "learning_rate": 8.114e-07, + "loss": 0.0095, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.4375, + "epoch": 0.37733333333333335, + "grad_norm": 10.653264045715332, + "kl": 0.24169921875, + "learning_rate": 8.113333333333333e-07, + "loss": 0.0097, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.5, + "epoch": 0.3774666666666667, + "grad_norm": 10.328964233398438, + "kl": 0.6533203125, + "learning_rate": 8.112666666666667e-07, + "loss": 0.0262, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5625, + "epoch": 0.3776, + "grad_norm": 8.597176551818848, + "kl": 0.3212890625, + "learning_rate": 8.112e-07, + "loss": 0.0129, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.9375, + "epoch": 0.3777333333333333, + "grad_norm": 12.347657203674316, + "kl": 0.228515625, + "learning_rate": 8.111333333333334e-07, + "loss": 0.0092, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.0, + "epoch": 0.3778666666666667, + "grad_norm": 7.773655891418457, + "kl": 0.2998046875, + "learning_rate": 8.110666666666667e-07, + "loss": 0.012, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.0625, + "epoch": 0.378, + "grad_norm": 8.387179374694824, + "kl": 0.2353515625, + "learning_rate": 8.11e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.0, + "epoch": 0.3781333333333333, + "grad_norm": 47.70299530029297, + "kl": 0.3974609375, + "learning_rate": 8.109333333333332e-07, + "loss": 0.0159, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.625, + "epoch": 0.37826666666666664, + "grad_norm": 0.6990506649017334, + "kl": 0.32421875, + "learning_rate": 8.108666666666666e-07, + "loss": 0.013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.5625, + "epoch": 0.3784, + "grad_norm": 10.167439460754395, + "kl": 0.279296875, + "learning_rate": 8.107999999999999e-07, + "loss": 0.0112, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5625, + "epoch": 0.37853333333333333, + "grad_norm": 6.310225963592529, + "kl": 0.2314453125, + "learning_rate": 8.107333333333332e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.5625, + "epoch": 0.37866666666666665, + "grad_norm": 6.683408737182617, + "kl": 0.30859375, + "learning_rate": 8.106666666666666e-07, + "loss": 0.0124, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.8125, + "epoch": 0.3788, + "grad_norm": 19.030319213867188, + "kl": 0.44140625, + "learning_rate": 8.105999999999999e-07, + "loss": 0.0176, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.37893333333333334, + "grad_norm": 7.167208671569824, + "kl": 0.2216796875, + "learning_rate": 8.105333333333333e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.4375, + "epoch": 0.37906666666666666, + "grad_norm": 8.140156745910645, + "kl": 0.2666015625, + "learning_rate": 8.104666666666666e-07, + "loss": 0.0106, + "reward": 1.125, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8125, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.25, + "epoch": 0.3792, + "grad_norm": 6.467743873596191, + "kl": 0.2236328125, + "learning_rate": 8.104e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.3125, + "epoch": 0.37933333333333336, + "grad_norm": 7.789659023284912, + "kl": 0.3876953125, + "learning_rate": 8.103333333333333e-07, + "loss": 0.0155, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.375, + "epoch": 0.3794666666666667, + "grad_norm": 9.082734107971191, + "kl": 0.22998046875, + "learning_rate": 8.102666666666667e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.3796, + "grad_norm": 8.834443092346191, + "kl": 0.29150390625, + "learning_rate": 8.102e-07, + "loss": 0.0117, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.75, + "epoch": 0.3797333333333333, + "grad_norm": 6.589456558227539, + "kl": 0.2509765625, + "learning_rate": 8.101333333333334e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0625, + "epoch": 0.3798666666666667, + "grad_norm": 6.6504807472229, + "kl": 0.21337890625, + "learning_rate": 8.100666666666667e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.125, + "epoch": 0.38, + "grad_norm": 6.011994361877441, + "kl": 0.3251953125, + "learning_rate": 8.1e-07, + "loss": 0.013, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5625, + "epoch": 0.3801333333333333, + "grad_norm": 7.68787145614624, + "kl": 0.2373046875, + "learning_rate": 8.099333333333333e-07, + "loss": 0.0095, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.25, + "epoch": 0.38026666666666664, + "grad_norm": 5.983354568481445, + "kl": 0.4462890625, + "learning_rate": 8.098666666666666e-07, + "loss": 0.0179, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0, + "epoch": 0.3804, + "grad_norm": 14.521173477172852, + "kl": 0.380859375, + "learning_rate": 8.098e-07, + "loss": 0.0152, + "reward": 1.25, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.75, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 0.38053333333333333, + "grad_norm": 15.165922164916992, + "kl": 0.41796875, + "learning_rate": 8.097333333333333e-07, + "loss": 0.0168, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.4375, + "epoch": 0.38066666666666665, + "grad_norm": 0.566997230052948, + "kl": 0.3564453125, + "learning_rate": 8.096666666666667e-07, + "loss": 0.0143, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0625, + "epoch": 0.3808, + "grad_norm": 8.744750022888184, + "kl": 0.2734375, + "learning_rate": 8.095999999999999e-07, + "loss": 0.0109, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.1875, + "epoch": 0.38093333333333335, + "grad_norm": 5.707592964172363, + "kl": 0.3046875, + "learning_rate": 8.095333333333333e-07, + "loss": 0.0122, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5625, + "epoch": 0.38106666666666666, + "grad_norm": 6.381083965301514, + "kl": 0.21630859375, + "learning_rate": 8.094666666666666e-07, + "loss": 0.0086, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.4375, + "epoch": 0.3812, + "grad_norm": 7.545849800109863, + "kl": 0.35546875, + "learning_rate": 8.093999999999999e-07, + "loss": 0.0142, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.25, + "epoch": 0.38133333333333336, + "grad_norm": 9.23757553100586, + "kl": 0.421875, + "learning_rate": 8.093333333333333e-07, + "loss": 0.0169, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0625, + "epoch": 0.3814666666666667, + "grad_norm": 6.1024322509765625, + "kl": 0.212890625, + "learning_rate": 8.092666666666666e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.1875, + "epoch": 0.3816, + "grad_norm": 9.40796947479248, + "kl": 0.3486328125, + "learning_rate": 8.092e-07, + "loss": 0.0139, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5625, + "epoch": 0.3817333333333333, + "grad_norm": 8.84007740020752, + "kl": 0.27587890625, + "learning_rate": 8.091333333333333e-07, + "loss": 0.0111, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.6875, + "epoch": 0.3818666666666667, + "grad_norm": 10.271957397460938, + "kl": 0.29052734375, + "learning_rate": 8.090666666666667e-07, + "loss": 0.0116, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.382, + "grad_norm": 15.230921745300293, + "kl": 0.4296875, + "learning_rate": 8.09e-07, + "loss": 0.0172, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.375, + "epoch": 0.3821333333333333, + "grad_norm": 6.513400554656982, + "kl": 0.35693359375, + "learning_rate": 8.089333333333333e-07, + "loss": 0.0143, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.4375, + "epoch": 0.38226666666666664, + "grad_norm": 3.547029495239258, + "kl": 0.322265625, + "learning_rate": 8.088666666666666e-07, + "loss": 0.0129, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.625, + "epoch": 0.3824, + "grad_norm": 5.6204962730407715, + "kl": 0.22998046875, + "learning_rate": 8.087999999999999e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.125, + "epoch": 0.38253333333333334, + "grad_norm": 9.418163299560547, + "kl": 0.29736328125, + "learning_rate": 8.087333333333333e-07, + "loss": 0.0119, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.4375, + "epoch": 0.38266666666666665, + "grad_norm": 7.737751007080078, + "kl": 0.31982421875, + "learning_rate": 8.086666666666666e-07, + "loss": 0.0128, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.3125, + "epoch": 0.3828, + "grad_norm": 10.305850982666016, + "kl": 0.51953125, + "learning_rate": 8.086e-07, + "loss": 0.0207, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.38293333333333335, + "grad_norm": 6.822449207305908, + "kl": 0.2607421875, + "learning_rate": 8.085333333333333e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.25, + "epoch": 0.38306666666666667, + "grad_norm": 4.412146091461182, + "kl": 0.3056640625, + "learning_rate": 8.084666666666667e-07, + "loss": 0.0122, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.9375, + "epoch": 0.3832, + "grad_norm": 6.017822265625, + "kl": 0.181640625, + "learning_rate": 8.084e-07, + "loss": 0.0073, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.38333333333333336, + "grad_norm": 8.833006858825684, + "kl": 0.3935546875, + "learning_rate": 8.083333333333334e-07, + "loss": 0.0157, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.3834666666666667, + "grad_norm": 6.529056549072266, + "kl": 0.2841796875, + "learning_rate": 8.082666666666667e-07, + "loss": 0.0114, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.4375, + "epoch": 0.3836, + "grad_norm": 9.47136116027832, + "kl": 0.2626953125, + "learning_rate": 8.081999999999999e-07, + "loss": 0.0105, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.375, + "epoch": 0.3837333333333333, + "grad_norm": 15.774664878845215, + "kl": 0.4248046875, + "learning_rate": 8.081333333333333e-07, + "loss": 0.017, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.0625, + "epoch": 0.3838666666666667, + "grad_norm": 5.836161136627197, + "kl": 0.236328125, + "learning_rate": 8.080666666666666e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.125, + "epoch": 0.384, + "grad_norm": 5.06757926940918, + "kl": 0.349609375, + "learning_rate": 8.08e-07, + "loss": 0.014, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.3841333333333333, + "grad_norm": 15.154519081115723, + "kl": 0.25244140625, + "learning_rate": 8.079333333333332e-07, + "loss": 0.0101, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.38426666666666665, + "grad_norm": 7.241763591766357, + "kl": 0.2646484375, + "learning_rate": 8.078666666666666e-07, + "loss": 0.0106, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.0625, + "epoch": 0.3844, + "grad_norm": 15.201043128967285, + "kl": 0.525390625, + "learning_rate": 8.077999999999999e-07, + "loss": 0.021, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.875, + "epoch": 0.38453333333333334, + "grad_norm": 14.03945255279541, + "kl": 0.3662109375, + "learning_rate": 8.077333333333333e-07, + "loss": 0.0146, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.9375, + "epoch": 0.38466666666666666, + "grad_norm": 9.377949714660645, + "kl": 0.25537109375, + "learning_rate": 8.076666666666666e-07, + "loss": 0.0102, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.125, + "epoch": 0.3848, + "grad_norm": 9.728574752807617, + "kl": 0.2490234375, + "learning_rate": 8.075999999999999e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.38493333333333335, + "grad_norm": 6.398385524749756, + "kl": 0.21142578125, + "learning_rate": 8.075333333333333e-07, + "loss": 0.0085, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.125, + "epoch": 0.38506666666666667, + "grad_norm": 5.881159782409668, + "kl": 0.27392578125, + "learning_rate": 8.074666666666666e-07, + "loss": 0.011, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5625, + "epoch": 0.3852, + "grad_norm": 9.704279899597168, + "kl": 0.22216796875, + "learning_rate": 8.074e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.9375, + "epoch": 0.38533333333333336, + "grad_norm": 7.087467193603516, + "kl": 0.3505859375, + "learning_rate": 8.073333333333333e-07, + "loss": 0.014, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.875, + "epoch": 0.3854666666666667, + "grad_norm": 0.8588481545448303, + "kl": 0.43359375, + "learning_rate": 8.072666666666667e-07, + "loss": 0.0173, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.3856, + "grad_norm": 7.5514373779296875, + "kl": 0.25048828125, + "learning_rate": 8.072e-07, + "loss": 0.01, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.625, + "epoch": 0.3857333333333333, + "grad_norm": 7.355426788330078, + "kl": 0.380859375, + "learning_rate": 8.071333333333334e-07, + "loss": 0.0152, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.375, + "epoch": 0.3858666666666667, + "grad_norm": 7.154294013977051, + "kl": 0.2734375, + "learning_rate": 8.070666666666667e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.386, + "grad_norm": 0.3737226128578186, + "kl": 0.24755859375, + "learning_rate": 8.070000000000001e-07, + "loss": 0.0099, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.375, + "epoch": 0.38613333333333333, + "grad_norm": 14.98223876953125, + "kl": 0.40771484375, + "learning_rate": 8.069333333333333e-07, + "loss": 0.0163, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.38626666666666665, + "grad_norm": 7.970999240875244, + "kl": 0.17822265625, + "learning_rate": 8.068666666666665e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.375, + "epoch": 0.3864, + "grad_norm": 4.439182758331299, + "kl": 0.16259765625, + "learning_rate": 8.067999999999999e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 0.38653333333333334, + "grad_norm": 5.000885486602783, + "kl": 0.22802734375, + "learning_rate": 8.067333333333332e-07, + "loss": 0.0091, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.38666666666666666, + "grad_norm": 8.662609100341797, + "kl": 0.21533203125, + "learning_rate": 8.066666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.3868, + "grad_norm": 8.871447563171387, + "kl": 0.2255859375, + "learning_rate": 8.065999999999999e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.3125, + "epoch": 0.38693333333333335, + "grad_norm": 8.327580451965332, + "kl": 0.1943359375, + "learning_rate": 8.065333333333333e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 0.38706666666666667, + "grad_norm": 8.931221008300781, + "kl": 0.1962890625, + "learning_rate": 8.064666666666666e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.375, + "epoch": 0.3872, + "grad_norm": 20.998620986938477, + "kl": 1.11474609375, + "learning_rate": 8.064e-07, + "loss": 0.0446, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.25, + "epoch": 0.3873333333333333, + "grad_norm": 4.428249835968018, + "kl": 0.21142578125, + "learning_rate": 8.063333333333333e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9375, + "epoch": 0.3874666666666667, + "grad_norm": 5.21307373046875, + "kl": 0.1669921875, + "learning_rate": 8.062666666666666e-07, + "loss": 0.0067, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75, + "epoch": 0.3876, + "grad_norm": 5.4450531005859375, + "kl": 0.19482421875, + "learning_rate": 8.062e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.3125, + "epoch": 0.3877333333333333, + "grad_norm": 3.5904617309570312, + "kl": 0.2421875, + "learning_rate": 8.061333333333333e-07, + "loss": 0.0097, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.4375, + "epoch": 0.3878666666666667, + "grad_norm": 6.602231025695801, + "kl": 0.18994140625, + "learning_rate": 8.060666666666667e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.4375, + "epoch": 0.388, + "grad_norm": 6.536410808563232, + "kl": 0.17822265625, + "learning_rate": 8.06e-07, + "loss": 0.0071, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.38813333333333333, + "grad_norm": 10.02408218383789, + "kl": 0.24658203125, + "learning_rate": 8.059333333333333e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.38826666666666665, + "grad_norm": 7.092767715454102, + "kl": 0.15966796875, + "learning_rate": 8.058666666666666e-07, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.9375, + "epoch": 0.3884, + "grad_norm": 7.518425941467285, + "kl": 0.1787109375, + "learning_rate": 8.058e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5625, + "epoch": 0.38853333333333334, + "grad_norm": 7.983473300933838, + "kl": 0.203125, + "learning_rate": 8.057333333333333e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3125, + "epoch": 0.38866666666666666, + "grad_norm": 4.117608547210693, + "kl": 0.1533203125, + "learning_rate": 8.056666666666666e-07, + "loss": 0.0061, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0625, + "epoch": 0.3888, + "grad_norm": 4.941583633422852, + "kl": 0.2431640625, + "learning_rate": 8.056e-07, + "loss": 0.0097, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 0.38893333333333335, + "grad_norm": 7.382198810577393, + "kl": 0.3017578125, + "learning_rate": 8.055333333333333e-07, + "loss": 0.0121, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.8125, + "epoch": 0.38906666666666667, + "grad_norm": 3.920259475708008, + "kl": 0.1533203125, + "learning_rate": 8.054666666666667e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5, + "epoch": 0.3892, + "grad_norm": 4.6473517417907715, + "kl": 0.2578125, + "learning_rate": 8.053999999999999e-07, + "loss": 0.0103, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.125, + "epoch": 0.3893333333333333, + "grad_norm": 4.779968738555908, + "kl": 0.17529296875, + "learning_rate": 8.053333333333333e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.125, + "epoch": 0.3894666666666667, + "grad_norm": 6.443263530731201, + "kl": 0.17822265625, + "learning_rate": 8.052666666666666e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.8349219560623169, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8125, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0625, + "epoch": 0.3896, + "grad_norm": 6.100740909576416, + "kl": 0.1943359375, + "learning_rate": 8.052e-07, + "loss": 0.0078, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0625, + "epoch": 0.3897333333333333, + "grad_norm": 14.035297393798828, + "kl": 0.2099609375, + "learning_rate": 8.051333333333333e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.75, + "epoch": 0.38986666666666664, + "grad_norm": 9.202543258666992, + "kl": 0.19091796875, + "learning_rate": 8.050666666666666e-07, + "loss": 0.0076, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 0.39, + "grad_norm": 4.60124397277832, + "kl": 0.213134765625, + "learning_rate": 8.05e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.4375, + "epoch": 0.39013333333333333, + "grad_norm": 4.463189125061035, + "kl": 0.2041015625, + "learning_rate": 8.049333333333332e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0, + "epoch": 0.39026666666666665, + "grad_norm": 6.324605464935303, + "kl": 0.2021484375, + "learning_rate": 8.048666666666666e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.9375, + "epoch": 0.3904, + "grad_norm": 36.9545783996582, + "kl": 0.18603515625, + "learning_rate": 8.047999999999999e-07, + "loss": 0.0074, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.3125, + "epoch": 0.39053333333333334, + "grad_norm": 4.478662967681885, + "kl": 0.1669921875, + "learning_rate": 8.047333333333333e-07, + "loss": 0.0067, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.625, + "epoch": 0.39066666666666666, + "grad_norm": 7.084095478057861, + "kl": 0.18798828125, + "learning_rate": 8.046666666666666e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 0.3908, + "grad_norm": 5.8718180656433105, + "kl": 0.2041015625, + "learning_rate": 8.046e-07, + "loss": 0.0082, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 0.39093333333333335, + "grad_norm": 4.388792991638184, + "kl": 0.21728515625, + "learning_rate": 8.045333333333333e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 0.3910666666666667, + "grad_norm": 8.202240943908691, + "kl": 0.2783203125, + "learning_rate": 8.044666666666666e-07, + "loss": 0.0112, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5625, + "epoch": 0.3912, + "grad_norm": 0.46097177267074585, + "kl": 0.20703125, + "learning_rate": 8.044e-07, + "loss": 0.0083, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.6875, + "epoch": 0.3913333333333333, + "grad_norm": 1.9133914709091187, + "kl": 0.349609375, + "learning_rate": 8.043333333333333e-07, + "loss": 0.014, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5625, + "epoch": 0.3914666666666667, + "grad_norm": 7.82487678527832, + "kl": 0.1552734375, + "learning_rate": 8.042666666666667e-07, + "loss": 0.0062, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.3916, + "grad_norm": 4.344534397125244, + "kl": 0.240234375, + "learning_rate": 8.042e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 0.3917333333333333, + "grad_norm": 0.48808732628822327, + "kl": 0.20263671875, + "learning_rate": 8.041333333333334e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.875, + "epoch": 0.39186666666666664, + "grad_norm": 5.511832237243652, + "kl": 0.16748046875, + "learning_rate": 8.040666666666667e-07, + "loss": 0.0067, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.875, + "epoch": 0.392, + "grad_norm": 7.274435520172119, + "kl": 0.15966796875, + "learning_rate": 8.04e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.39213333333333333, + "grad_norm": 8.528849601745605, + "kl": 0.1884765625, + "learning_rate": 8.039333333333333e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0, + "epoch": 0.39226666666666665, + "grad_norm": 6.572739601135254, + "kl": 0.2314453125, + "learning_rate": 8.038666666666665e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.3924, + "grad_norm": 10.111961364746094, + "kl": 0.30078125, + "learning_rate": 8.037999999999999e-07, + "loss": 0.0121, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.39253333333333335, + "grad_norm": 5.445520877838135, + "kl": 0.17529296875, + "learning_rate": 8.037333333333332e-07, + "loss": 0.007, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.39266666666666666, + "grad_norm": 5.439220428466797, + "kl": 0.2333984375, + "learning_rate": 8.036666666666666e-07, + "loss": 0.0094, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.625, + "epoch": 0.3928, + "grad_norm": 5.4361162185668945, + "kl": 0.298828125, + "learning_rate": 8.035999999999999e-07, + "loss": 0.012, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.3125, + "epoch": 0.39293333333333336, + "grad_norm": 0.5095065236091614, + "kl": 0.2607421875, + "learning_rate": 8.035333333333333e-07, + "loss": 0.0104, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.625, + "epoch": 0.3930666666666667, + "grad_norm": 7.111481189727783, + "kl": 0.2861328125, + "learning_rate": 8.034666666666666e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0625, + "epoch": 0.3932, + "grad_norm": 4.963849067687988, + "kl": 0.171875, + "learning_rate": 8.034e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.3125, + "epoch": 0.3933333333333333, + "grad_norm": 0.5241984724998474, + "kl": 0.24853515625, + "learning_rate": 8.033333333333333e-07, + "loss": 0.01, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.75, + "epoch": 0.3934666666666667, + "grad_norm": 12.931024551391602, + "kl": 0.25146484375, + "learning_rate": 8.032666666666667e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.6875, + "epoch": 0.3936, + "grad_norm": 5.019949436187744, + "kl": 0.314453125, + "learning_rate": 8.032e-07, + "loss": 0.0126, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5, + "epoch": 0.3937333333333333, + "grad_norm": 9.99043083190918, + "kl": 0.2431640625, + "learning_rate": 8.031333333333333e-07, + "loss": 0.0097, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.6875, + "epoch": 0.39386666666666664, + "grad_norm": 7.137509346008301, + "kl": 0.2470703125, + "learning_rate": 8.030666666666667e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.394, + "grad_norm": 9.380501747131348, + "kl": 0.408203125, + "learning_rate": 8.03e-07, + "loss": 0.0163, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.625, + "epoch": 0.39413333333333334, + "grad_norm": 8.646175384521484, + "kl": 0.26953125, + "learning_rate": 8.029333333333334e-07, + "loss": 0.0108, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.625, + "epoch": 0.39426666666666665, + "grad_norm": 7.018808841705322, + "kl": 0.298828125, + "learning_rate": 8.028666666666666e-07, + "loss": 0.0119, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.3125, + "epoch": 0.3944, + "grad_norm": 9.602266311645508, + "kl": 0.24169921875, + "learning_rate": 8.028e-07, + "loss": 0.0097, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 0.39453333333333335, + "grad_norm": 6.223363399505615, + "kl": 0.314453125, + "learning_rate": 8.027333333333333e-07, + "loss": 0.0126, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5, + "epoch": 0.39466666666666667, + "grad_norm": 7.665163993835449, + "kl": 0.294921875, + "learning_rate": 8.026666666666667e-07, + "loss": 0.0118, + "reward": 1.375, + "reward_std": 0.7168372869491577, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.3948, + "grad_norm": 8.940115928649902, + "kl": 0.2314453125, + "learning_rate": 8.026e-07, + "loss": 0.0093, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.39493333333333336, + "grad_norm": 6.278318405151367, + "kl": 0.21533203125, + "learning_rate": 8.025333333333332e-07, + "loss": 0.0086, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.3950666666666667, + "grad_norm": 8.185022354125977, + "kl": 0.2822265625, + "learning_rate": 8.024666666666666e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.9375, + "epoch": 0.3952, + "grad_norm": 0.35999438166618347, + "kl": 0.25439453125, + "learning_rate": 8.023999999999999e-07, + "loss": 0.0102, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.875, + "epoch": 0.3953333333333333, + "grad_norm": 6.374248027801514, + "kl": 0.27490234375, + "learning_rate": 8.023333333333333e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.375, + "epoch": 0.3954666666666667, + "grad_norm": 1.145810842514038, + "kl": 0.28515625, + "learning_rate": 8.022666666666666e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.625, + "epoch": 0.3956, + "grad_norm": 8.21973991394043, + "kl": 0.34033203125, + "learning_rate": 8.022e-07, + "loss": 0.0136, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0, + "epoch": 0.3957333333333333, + "grad_norm": 7.4248881340026855, + "kl": 0.2236328125, + "learning_rate": 8.021333333333333e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.4375, + "epoch": 0.39586666666666664, + "grad_norm": 61.24213409423828, + "kl": 0.3134765625, + "learning_rate": 8.020666666666667e-07, + "loss": 0.0125, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.8125, + "epoch": 0.396, + "grad_norm": 5.627874851226807, + "kl": 0.23583984375, + "learning_rate": 8.02e-07, + "loss": 0.0094, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.9375, + "epoch": 0.39613333333333334, + "grad_norm": 16.184232711791992, + "kl": 0.353515625, + "learning_rate": 8.019333333333333e-07, + "loss": 0.0141, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.6875, + "epoch": 0.39626666666666666, + "grad_norm": 0.4380037486553192, + "kl": 0.29443359375, + "learning_rate": 8.018666666666666e-07, + "loss": 0.0118, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.3964, + "grad_norm": 13.530868530273438, + "kl": 0.3427734375, + "learning_rate": 8.017999999999999e-07, + "loss": 0.0137, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.9375, + "epoch": 0.39653333333333335, + "grad_norm": 14.154593467712402, + "kl": 0.462890625, + "learning_rate": 8.017333333333333e-07, + "loss": 0.0185, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.1875, + "epoch": 0.39666666666666667, + "grad_norm": 5.4777326583862305, + "kl": 0.22021484375, + "learning_rate": 8.016666666666666e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.3968, + "grad_norm": 8.972188949584961, + "kl": 0.2724609375, + "learning_rate": 8.016e-07, + "loss": 0.0109, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0, + "epoch": 0.39693333333333336, + "grad_norm": 8.852663040161133, + "kl": 0.28173828125, + "learning_rate": 8.015333333333333e-07, + "loss": 0.0113, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.4375, + "epoch": 0.3970666666666667, + "grad_norm": 8.723069190979004, + "kl": 0.2841796875, + "learning_rate": 8.014666666666667e-07, + "loss": 0.0114, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.0, + "epoch": 0.3972, + "grad_norm": 7.394358158111572, + "kl": 0.296875, + "learning_rate": 8.014e-07, + "loss": 0.0119, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.3125, + "epoch": 0.3973333333333333, + "grad_norm": 4.5646796226501465, + "kl": 0.21044921875, + "learning_rate": 8.013333333333333e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.9375, + "epoch": 0.3974666666666667, + "grad_norm": 6.469696998596191, + "kl": 0.294921875, + "learning_rate": 8.012666666666667e-07, + "loss": 0.0118, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.3976, + "grad_norm": 42.33256149291992, + "kl": 0.28515625, + "learning_rate": 8.012e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0, + "epoch": 0.3977333333333333, + "grad_norm": 8.171295166015625, + "kl": 0.234375, + "learning_rate": 8.011333333333333e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.39786666666666665, + "grad_norm": 8.067204475402832, + "kl": 0.23388671875, + "learning_rate": 8.010666666666666e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.4375, + "epoch": 0.398, + "grad_norm": 7.743842601776123, + "kl": 0.2998046875, + "learning_rate": 8.01e-07, + "loss": 0.012, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.25, + "epoch": 0.39813333333333334, + "grad_norm": 9.45948600769043, + "kl": 0.25244140625, + "learning_rate": 8.009333333333333e-07, + "loss": 0.0101, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.375, + "epoch": 0.39826666666666666, + "grad_norm": 4.713519096374512, + "kl": 0.232421875, + "learning_rate": 8.008666666666666e-07, + "loss": 0.0093, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.0625, + "epoch": 0.3984, + "grad_norm": 9.487550735473633, + "kl": 0.2705078125, + "learning_rate": 8.007999999999999e-07, + "loss": 0.0108, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.9375, + "epoch": 0.39853333333333335, + "grad_norm": 9.058597564697266, + "kl": 0.3193359375, + "learning_rate": 8.007333333333332e-07, + "loss": 0.0128, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 0.39866666666666667, + "grad_norm": 10.287652969360352, + "kl": 0.2734375, + "learning_rate": 8.006666666666666e-07, + "loss": 0.0109, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.25, + "epoch": 0.3988, + "grad_norm": 7.916517734527588, + "kl": 0.25537109375, + "learning_rate": 8.005999999999999e-07, + "loss": 0.0102, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.4375, + "epoch": 0.3989333333333333, + "grad_norm": 6.5857086181640625, + "kl": 0.359375, + "learning_rate": 8.005333333333333e-07, + "loss": 0.0144, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.3990666666666667, + "grad_norm": 0.539369523525238, + "kl": 0.345703125, + "learning_rate": 8.004666666666666e-07, + "loss": 0.0138, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.125, + "epoch": 0.3992, + "grad_norm": 7.896547794342041, + "kl": 0.24267578125, + "learning_rate": 8.004e-07, + "loss": 0.0097, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.5625, + "epoch": 0.3993333333333333, + "grad_norm": 9.404285430908203, + "kl": 0.296875, + "learning_rate": 8.003333333333333e-07, + "loss": 0.0119, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.5, + "epoch": 0.3994666666666667, + "grad_norm": 6.015496730804443, + "kl": 0.4052734375, + "learning_rate": 8.002666666666667e-07, + "loss": 0.0162, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.3125, + "epoch": 0.3996, + "grad_norm": 9.222078323364258, + "kl": 0.2685546875, + "learning_rate": 8.002e-07, + "loss": 0.0108, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.9375, + "epoch": 0.39973333333333333, + "grad_norm": 9.387727737426758, + "kl": 0.3466796875, + "learning_rate": 8.001333333333334e-07, + "loss": 0.0139, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.39986666666666665, + "grad_norm": 7.052680492401123, + "kl": 0.1875, + "learning_rate": 8.000666666666667e-07, + "loss": 0.0075, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.4, + "grad_norm": 7.741370677947998, + "kl": 0.2548828125, + "learning_rate": 8e-07, + "loss": 0.0102, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.25, + "epoch": 0.40013333333333334, + "grad_norm": 10.485509872436523, + "kl": 0.212890625, + "learning_rate": 7.999333333333334e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.5625, + "epoch": 0.40026666666666666, + "grad_norm": 1.0174914598464966, + "kl": 0.4228515625, + "learning_rate": 7.998666666666665e-07, + "loss": 0.0169, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.375, + "epoch": 0.4004, + "grad_norm": 7.612611770629883, + "kl": 0.2958984375, + "learning_rate": 7.998e-07, + "loss": 0.0118, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.6875, + "epoch": 0.40053333333333335, + "grad_norm": 9.124173164367676, + "kl": 0.4453125, + "learning_rate": 7.997333333333332e-07, + "loss": 0.0178, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.25, + "epoch": 0.40066666666666667, + "grad_norm": 5.451950550079346, + "kl": 0.30419921875, + "learning_rate": 7.996666666666666e-07, + "loss": 0.0122, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.5625, + "epoch": 0.4008, + "grad_norm": 10.49311637878418, + "kl": 0.365234375, + "learning_rate": 7.995999999999999e-07, + "loss": 0.0146, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0625, + "epoch": 0.4009333333333333, + "grad_norm": 9.376008987426758, + "kl": 0.28759765625, + "learning_rate": 7.995333333333333e-07, + "loss": 0.0115, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.8125, + "epoch": 0.4010666666666667, + "grad_norm": 40.26333236694336, + "kl": 3.15087890625, + "learning_rate": 7.994666666666666e-07, + "loss": 0.1258, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.4375, + "epoch": 0.4012, + "grad_norm": 12.48981761932373, + "kl": 0.330078125, + "learning_rate": 7.993999999999999e-07, + "loss": 0.0132, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.4375, + "epoch": 0.4013333333333333, + "grad_norm": 7.402318000793457, + "kl": 0.337890625, + "learning_rate": 7.993333333333333e-07, + "loss": 0.0135, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0625, + "epoch": 0.4014666666666667, + "grad_norm": 5.325318813323975, + "kl": 0.294921875, + "learning_rate": 7.992666666666666e-07, + "loss": 0.0118, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.125, + "epoch": 0.4016, + "grad_norm": 9.22677230834961, + "kl": 0.3486328125, + "learning_rate": 7.992e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.625, + "epoch": 0.40173333333333333, + "grad_norm": 9.996635437011719, + "kl": 0.47265625, + "learning_rate": 7.991333333333333e-07, + "loss": 0.0189, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.6875, + "epoch": 0.40186666666666665, + "grad_norm": 161.39418029785156, + "kl": 0.3583984375, + "learning_rate": 7.990666666666667e-07, + "loss": 0.0143, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.402, + "grad_norm": 0.46983054280281067, + "kl": 0.296875, + "learning_rate": 7.99e-07, + "loss": 0.0119, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.40213333333333334, + "grad_norm": 23.55986213684082, + "kl": 0.263671875, + "learning_rate": 7.989333333333334e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.3125, + "epoch": 0.40226666666666666, + "grad_norm": 13.283239364624023, + "kl": 0.294921875, + "learning_rate": 7.988666666666666e-07, + "loss": 0.0118, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.625, + "epoch": 0.4024, + "grad_norm": 6.138981819152832, + "kl": 0.3515625, + "learning_rate": 7.987999999999999e-07, + "loss": 0.0141, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 0.40253333333333335, + "grad_norm": 9.658381462097168, + "kl": 0.3427734375, + "learning_rate": 7.987333333333333e-07, + "loss": 0.0137, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.5625, + "epoch": 0.4026666666666667, + "grad_norm": 1.7893215417861938, + "kl": 0.439453125, + "learning_rate": 7.986666666666666e-07, + "loss": 0.0176, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.4028, + "grad_norm": 106.31145477294922, + "kl": 0.2822265625, + "learning_rate": 7.986e-07, + "loss": 0.0113, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.6875, + "epoch": 0.4029333333333333, + "grad_norm": 11.14863395690918, + "kl": 0.4892578125, + "learning_rate": 7.985333333333333e-07, + "loss": 0.0195, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.375, + "epoch": 0.4030666666666667, + "grad_norm": 6.7276291847229, + "kl": 0.4208984375, + "learning_rate": 7.984666666666667e-07, + "loss": 0.0169, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.875, + "epoch": 0.4032, + "grad_norm": 6.750116348266602, + "kl": 0.29833984375, + "learning_rate": 7.984e-07, + "loss": 0.0119, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.4375, + "epoch": 0.4033333333333333, + "grad_norm": 10.877337455749512, + "kl": 0.3857421875, + "learning_rate": 7.983333333333333e-07, + "loss": 0.0154, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.625, + "epoch": 0.40346666666666664, + "grad_norm": 0.5678276419639587, + "kl": 0.28759765625, + "learning_rate": 7.982666666666666e-07, + "loss": 0.0115, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.75, + "epoch": 0.4036, + "grad_norm": 10.3952054977417, + "kl": 0.31396484375, + "learning_rate": 7.981999999999999e-07, + "loss": 0.0126, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.3125, + "epoch": 0.40373333333333333, + "grad_norm": 13.800960540771484, + "kl": 0.294921875, + "learning_rate": 7.981333333333333e-07, + "loss": 0.0118, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.9375, + "epoch": 0.40386666666666665, + "grad_norm": 6.617099761962891, + "kl": 0.46875, + "learning_rate": 7.980666666666666e-07, + "loss": 0.0187, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.875, + "epoch": 0.404, + "grad_norm": 0.43659746646881104, + "kl": 0.3359375, + "learning_rate": 7.98e-07, + "loss": 0.0135, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.875, + "epoch": 0.40413333333333334, + "grad_norm": 5.261110305786133, + "kl": 0.26171875, + "learning_rate": 7.979333333333333e-07, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.0, + "epoch": 0.40426666666666666, + "grad_norm": 6.1956706047058105, + "kl": 0.27392578125, + "learning_rate": 7.978666666666666e-07, + "loss": 0.0109, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.875, + "epoch": 0.4044, + "grad_norm": 10.586735725402832, + "kl": 0.29736328125, + "learning_rate": 7.977999999999999e-07, + "loss": 0.0119, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.9375, + "epoch": 0.40453333333333336, + "grad_norm": 14.562255859375, + "kl": 0.21337890625, + "learning_rate": 7.977333333333333e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.9375, + "epoch": 0.4046666666666667, + "grad_norm": 6.777864456176758, + "kl": 0.2890625, + "learning_rate": 7.976666666666666e-07, + "loss": 0.0116, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.75, + "epoch": 0.4048, + "grad_norm": 0.35433781147003174, + "kl": 0.22900390625, + "learning_rate": 7.975999999999999e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.4049333333333333, + "grad_norm": 5.227321147918701, + "kl": 0.248046875, + "learning_rate": 7.975333333333333e-07, + "loss": 0.0099, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.875, + "epoch": 0.4050666666666667, + "grad_norm": 4.965774059295654, + "kl": 0.22509765625, + "learning_rate": 7.974666666666666e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.9375, + "epoch": 0.4052, + "grad_norm": 8.419178009033203, + "kl": 0.189453125, + "learning_rate": 7.974e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5, + "epoch": 0.4053333333333333, + "grad_norm": 7.520660877227783, + "kl": 0.2275390625, + "learning_rate": 7.973333333333333e-07, + "loss": 0.0091, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.4375, + "epoch": 0.40546666666666664, + "grad_norm": 9.697541236877441, + "kl": 0.333984375, + "learning_rate": 7.972666666666667e-07, + "loss": 0.0133, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.0625, + "epoch": 0.4056, + "grad_norm": 7.430934906005859, + "kl": 0.2890625, + "learning_rate": 7.972e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.375, + "epoch": 0.40573333333333333, + "grad_norm": 8.788890838623047, + "kl": 0.16162109375, + "learning_rate": 7.971333333333334e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.375, + "epoch": 0.40586666666666665, + "grad_norm": 4.939852237701416, + "kl": 0.32373046875, + "learning_rate": 7.970666666666667e-07, + "loss": 0.013, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.3125, + "epoch": 0.406, + "grad_norm": 0.4130728840827942, + "kl": 0.19677734375, + "learning_rate": 7.970000000000001e-07, + "loss": 0.0079, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5, + "epoch": 0.40613333333333335, + "grad_norm": 0.3413063883781433, + "kl": 0.1806640625, + "learning_rate": 7.969333333333333e-07, + "loss": 0.0072, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.25, + "epoch": 0.40626666666666666, + "grad_norm": 4.376731872558594, + "kl": 0.2216796875, + "learning_rate": 7.968666666666665e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.875, + "epoch": 0.4064, + "grad_norm": 7.002997398376465, + "kl": 0.20703125, + "learning_rate": 7.967999999999999e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.125, + "epoch": 0.40653333333333336, + "grad_norm": 8.133936882019043, + "kl": 0.20068359375, + "learning_rate": 7.967333333333332e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9375, + "epoch": 0.4066666666666667, + "grad_norm": 4.60258150100708, + "kl": 0.2216796875, + "learning_rate": 7.966666666666666e-07, + "loss": 0.0089, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5, + "epoch": 0.4068, + "grad_norm": 7.24094820022583, + "kl": 0.2080078125, + "learning_rate": 7.965999999999999e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.5625, + "epoch": 0.4069333333333333, + "grad_norm": 4.931095600128174, + "kl": 0.27880859375, + "learning_rate": 7.965333333333333e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 0.4070666666666667, + "grad_norm": 25.38153839111328, + "kl": 0.16259765625, + "learning_rate": 7.964666666666666e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.4072, + "grad_norm": 6.653560638427734, + "kl": 0.208984375, + "learning_rate": 7.964e-07, + "loss": 0.0084, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.625, + "epoch": 0.4073333333333333, + "grad_norm": 6.431994915008545, + "kl": 0.1611328125, + "learning_rate": 7.963333333333333e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.3125, + "epoch": 0.40746666666666664, + "grad_norm": 7.006518840789795, + "kl": 0.19482421875, + "learning_rate": 7.962666666666666e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 0.4076, + "grad_norm": 7.199805736541748, + "kl": 0.1875, + "learning_rate": 7.962e-07, + "loss": 0.0075, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.40773333333333334, + "grad_norm": 6.123218059539795, + "kl": 0.20654296875, + "learning_rate": 7.961333333333333e-07, + "loss": 0.0083, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.9375, + "epoch": 0.40786666666666666, + "grad_norm": 5.283836841583252, + "kl": 0.16015625, + "learning_rate": 7.960666666666667e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.375, + "epoch": 0.408, + "grad_norm": 4.474870204925537, + "kl": 0.220703125, + "learning_rate": 7.96e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.3125, + "epoch": 0.40813333333333335, + "grad_norm": 4.296930313110352, + "kl": 0.24755859375, + "learning_rate": 7.959333333333334e-07, + "loss": 0.0099, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.75, + "epoch": 0.40826666666666667, + "grad_norm": 7.826120376586914, + "kl": 0.275390625, + "learning_rate": 7.958666666666666e-07, + "loss": 0.011, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5, + "epoch": 0.4084, + "grad_norm": 6.325845241546631, + "kl": 0.18798828125, + "learning_rate": 7.958e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.625, + "epoch": 0.40853333333333336, + "grad_norm": 6.909554481506348, + "kl": 0.2060546875, + "learning_rate": 7.957333333333333e-07, + "loss": 0.0082, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 0.4086666666666667, + "grad_norm": 7.649840354919434, + "kl": 0.20068359375, + "learning_rate": 7.956666666666666e-07, + "loss": 0.008, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5625, + "epoch": 0.4088, + "grad_norm": 7.585697650909424, + "kl": 0.16650390625, + "learning_rate": 7.956e-07, + "loss": 0.0067, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.25, + "epoch": 0.4089333333333333, + "grad_norm": 9.825115203857422, + "kl": 0.1806640625, + "learning_rate": 7.955333333333332e-07, + "loss": 0.0072, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.4090666666666667, + "grad_norm": 8.245543479919434, + "kl": 0.157958984375, + "learning_rate": 7.954666666666666e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.4092, + "grad_norm": 6.33628511428833, + "kl": 0.29052734375, + "learning_rate": 7.953999999999999e-07, + "loss": 0.0116, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 0.4093333333333333, + "grad_norm": 5.334640026092529, + "kl": 0.3115234375, + "learning_rate": 7.953333333333333e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.625, + "epoch": 0.40946666666666665, + "grad_norm": 7.552859783172607, + "kl": 0.359375, + "learning_rate": 7.952666666666666e-07, + "loss": 0.0144, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.5625, + "epoch": 0.4096, + "grad_norm": 7.133388519287109, + "kl": 0.1787109375, + "learning_rate": 7.952e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.40973333333333334, + "grad_norm": 4.52786111831665, + "kl": 0.2431640625, + "learning_rate": 7.951333333333333e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8125, + "epoch": 0.40986666666666666, + "grad_norm": 9.870474815368652, + "kl": 0.21142578125, + "learning_rate": 7.950666666666666e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5625, + "epoch": 0.41, + "grad_norm": 0.35968175530433655, + "kl": 0.19482421875, + "learning_rate": 7.95e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.41013333333333335, + "grad_norm": 6.911659240722656, + "kl": 0.2080078125, + "learning_rate": 7.949333333333333e-07, + "loss": 0.0083, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 0.41026666666666667, + "grad_norm": 7.414624214172363, + "kl": 0.2734375, + "learning_rate": 7.948666666666667e-07, + "loss": 0.011, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9375, + "epoch": 0.4104, + "grad_norm": 3.7484304904937744, + "kl": 0.21044921875, + "learning_rate": 7.947999999999999e-07, + "loss": 0.0084, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 0.4105333333333333, + "grad_norm": 4.488154411315918, + "kl": 0.28955078125, + "learning_rate": 7.947333333333333e-07, + "loss": 0.0116, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0625, + "epoch": 0.4106666666666667, + "grad_norm": 5.979602813720703, + "kl": 0.21240234375, + "learning_rate": 7.946666666666666e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.4108, + "grad_norm": 9.749399185180664, + "kl": 0.35107421875, + "learning_rate": 7.946e-07, + "loss": 0.014, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 0.4109333333333333, + "grad_norm": 30.873336791992188, + "kl": 0.29248046875, + "learning_rate": 7.945333333333333e-07, + "loss": 0.0117, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 0.4110666666666667, + "grad_norm": 5.087526798248291, + "kl": 0.22265625, + "learning_rate": 7.944666666666666e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.375, + "epoch": 0.4112, + "grad_norm": 10.585983276367188, + "kl": 0.2841796875, + "learning_rate": 7.944e-07, + "loss": 0.0114, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5, + "epoch": 0.41133333333333333, + "grad_norm": 6.369890213012695, + "kl": 0.490234375, + "learning_rate": 7.943333333333333e-07, + "loss": 0.0196, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.41146666666666665, + "grad_norm": 8.217092514038086, + "kl": 0.23291015625, + "learning_rate": 7.942666666666667e-07, + "loss": 0.0093, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.375, + "epoch": 0.4116, + "grad_norm": 0.26555126905441284, + "kl": 0.1513671875, + "learning_rate": 7.942e-07, + "loss": 0.0061, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 0.41173333333333334, + "grad_norm": 4.766816139221191, + "kl": 0.21533203125, + "learning_rate": 7.941333333333334e-07, + "loss": 0.0086, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.41186666666666666, + "grad_norm": 5.996304035186768, + "kl": 0.228515625, + "learning_rate": 7.940666666666666e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5625, + "epoch": 0.412, + "grad_norm": 10.037238121032715, + "kl": 0.38818359375, + "learning_rate": 7.94e-07, + "loss": 0.0155, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0, + "epoch": 0.41213333333333335, + "grad_norm": 67.0439224243164, + "kl": 2.7548828125, + "learning_rate": 7.939333333333333e-07, + "loss": 0.1103, + "reward": 1.4375, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.41226666666666667, + "grad_norm": 10.335143089294434, + "kl": 0.22900390625, + "learning_rate": 7.938666666666667e-07, + "loss": 0.0092, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.8125, + "epoch": 0.4124, + "grad_norm": 0.47199347615242004, + "kl": 0.3642578125, + "learning_rate": 7.937999999999999e-07, + "loss": 0.0146, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.9375, + "epoch": 0.4125333333333333, + "grad_norm": 8.357438087463379, + "kl": 0.345703125, + "learning_rate": 7.937333333333332e-07, + "loss": 0.0138, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5625, + "epoch": 0.4126666666666667, + "grad_norm": 7.760474681854248, + "kl": 0.296875, + "learning_rate": 7.936666666666666e-07, + "loss": 0.0119, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.6875, + "epoch": 0.4128, + "grad_norm": 5.885244369506836, + "kl": 0.17919921875, + "learning_rate": 7.935999999999999e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.125, + "epoch": 0.4129333333333333, + "grad_norm": 13.161482810974121, + "kl": 0.4541015625, + "learning_rate": 7.935333333333333e-07, + "loss": 0.0182, + "reward": 1.25, + "reward_std": 0.6746576428413391, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.8125, + "epoch": 0.4130666666666667, + "grad_norm": 4.998668670654297, + "kl": 0.203125, + "learning_rate": 7.934666666666666e-07, + "loss": 0.0081, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.6875, + "epoch": 0.4132, + "grad_norm": 4.558542728424072, + "kl": 0.1767578125, + "learning_rate": 7.934e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.75, + "epoch": 0.41333333333333333, + "grad_norm": 7.102116107940674, + "kl": 0.23095703125, + "learning_rate": 7.933333333333333e-07, + "loss": 0.0092, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.5625, + "epoch": 0.41346666666666665, + "grad_norm": 8.517900466918945, + "kl": 0.2353515625, + "learning_rate": 7.932666666666667e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.25, + "epoch": 0.4136, + "grad_norm": 6.319381237030029, + "kl": 0.486328125, + "learning_rate": 7.932e-07, + "loss": 0.0195, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.625, + "epoch": 0.41373333333333334, + "grad_norm": 8.103922843933105, + "kl": 0.3662109375, + "learning_rate": 7.931333333333333e-07, + "loss": 0.0146, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.41386666666666666, + "grad_norm": 8.154641151428223, + "kl": 0.30078125, + "learning_rate": 7.930666666666667e-07, + "loss": 0.0121, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 0.414, + "grad_norm": 7.813176155090332, + "kl": 0.2890625, + "learning_rate": 7.93e-07, + "loss": 0.0116, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.41413333333333335, + "grad_norm": 8.487632751464844, + "kl": 0.25244140625, + "learning_rate": 7.929333333333334e-07, + "loss": 0.0101, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.625, + "epoch": 0.41426666666666667, + "grad_norm": 10.003806114196777, + "kl": 0.25830078125, + "learning_rate": 7.928666666666667e-07, + "loss": 0.0103, + "reward": 1.375, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.3125, + "epoch": 0.4144, + "grad_norm": 5.1701459884643555, + "kl": 0.22900390625, + "learning_rate": 7.928e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5, + "epoch": 0.4145333333333333, + "grad_norm": 5.6927618980407715, + "kl": 0.32421875, + "learning_rate": 7.927333333333332e-07, + "loss": 0.0129, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.9375, + "epoch": 0.4146666666666667, + "grad_norm": 9.711394309997559, + "kl": 0.3876953125, + "learning_rate": 7.926666666666666e-07, + "loss": 0.0155, + "reward": 1.6875, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 0.4148, + "grad_norm": 0.7863552570343018, + "kl": 0.228515625, + "learning_rate": 7.925999999999999e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.4149333333333333, + "grad_norm": 9.93463134765625, + "kl": 0.21923828125, + "learning_rate": 7.925333333333332e-07, + "loss": 0.0088, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.41506666666666664, + "grad_norm": 5.481283664703369, + "kl": 0.2421875, + "learning_rate": 7.924666666666666e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.375, + "epoch": 0.4152, + "grad_norm": 5.728235244750977, + "kl": 0.25, + "learning_rate": 7.923999999999999e-07, + "loss": 0.01, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0, + "epoch": 0.41533333333333333, + "grad_norm": 5.321434497833252, + "kl": 0.2978515625, + "learning_rate": 7.923333333333333e-07, + "loss": 0.0119, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.3125, + "epoch": 0.41546666666666665, + "grad_norm": 6.7823028564453125, + "kl": 0.314453125, + "learning_rate": 7.922666666666666e-07, + "loss": 0.0126, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.4156, + "grad_norm": 5.347249507904053, + "kl": 0.255859375, + "learning_rate": 7.922e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.41573333333333334, + "grad_norm": 6.905178070068359, + "kl": 0.265625, + "learning_rate": 7.921333333333333e-07, + "loss": 0.0106, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.41586666666666666, + "grad_norm": 10.247742652893066, + "kl": 0.2001953125, + "learning_rate": 7.920666666666667e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.416, + "grad_norm": 5.27961540222168, + "kl": 0.234375, + "learning_rate": 7.92e-07, + "loss": 0.0094, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.9375, + "epoch": 0.41613333333333336, + "grad_norm": 7.165731430053711, + "kl": 0.2412109375, + "learning_rate": 7.919333333333333e-07, + "loss": 0.0097, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.9375, + "epoch": 0.4162666666666667, + "grad_norm": 4.624126434326172, + "kl": 0.2646484375, + "learning_rate": 7.918666666666667e-07, + "loss": 0.0106, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5625, + "epoch": 0.4164, + "grad_norm": 6.24846076965332, + "kl": 0.255859375, + "learning_rate": 7.917999999999999e-07, + "loss": 0.0103, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.4375, + "epoch": 0.4165333333333333, + "grad_norm": 4.872045516967773, + "kl": 0.201171875, + "learning_rate": 7.917333333333333e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3124 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5, + "epoch": 0.4166666666666667, + "grad_norm": 40.42748260498047, + "kl": 0.86669921875, + "learning_rate": 7.916666666666666e-07, + "loss": 0.0347, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.6875, + "epoch": 0.4168, + "grad_norm": 5.716443061828613, + "kl": 0.25732421875, + "learning_rate": 7.916e-07, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3126 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.4169333333333333, + "grad_norm": 9.00489330291748, + "kl": 0.30078125, + "learning_rate": 7.915333333333333e-07, + "loss": 0.0121, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3127 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.1875, + "epoch": 0.41706666666666664, + "grad_norm": 5.294042587280273, + "kl": 0.3486328125, + "learning_rate": 7.914666666666667e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3128 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0625, + "epoch": 0.4172, + "grad_norm": 12.10549545288086, + "kl": 0.20751953125, + "learning_rate": 7.914e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3129 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.625, + "epoch": 0.41733333333333333, + "grad_norm": 6.307875156402588, + "kl": 0.3212890625, + "learning_rate": 7.913333333333332e-07, + "loss": 0.0128, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3130 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.8125, + "epoch": 0.41746666666666665, + "grad_norm": 6.449913501739502, + "kl": 0.244140625, + "learning_rate": 7.912666666666666e-07, + "loss": 0.0098, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3131 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.5625, + "epoch": 0.4176, + "grad_norm": 9.832964897155762, + "kl": 0.607421875, + "learning_rate": 7.911999999999999e-07, + "loss": 0.0243, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3132 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9375, + "epoch": 0.41773333333333335, + "grad_norm": 8.265486717224121, + "kl": 0.251953125, + "learning_rate": 7.911333333333333e-07, + "loss": 0.0101, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3133 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.75, + "epoch": 0.41786666666666666, + "grad_norm": 9.016061782836914, + "kl": 0.2421875, + "learning_rate": 7.910666666666666e-07, + "loss": 0.0097, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 3134 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 0.418, + "grad_norm": 5.68404483795166, + "kl": 0.220703125, + "learning_rate": 7.91e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3135 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.875, + "epoch": 0.41813333333333336, + "grad_norm": 5.483665466308594, + "kl": 0.255859375, + "learning_rate": 7.909333333333333e-07, + "loss": 0.0102, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3136 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.375, + "epoch": 0.4182666666666667, + "grad_norm": 6.54442834854126, + "kl": 0.25244140625, + "learning_rate": 7.908666666666667e-07, + "loss": 0.0101, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.875, + "step": 3137 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.4184, + "grad_norm": 11.91751766204834, + "kl": 0.2333984375, + "learning_rate": 7.907999999999999e-07, + "loss": 0.0093, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3138 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.4185333333333333, + "grad_norm": 6.932803630828857, + "kl": 0.263671875, + "learning_rate": 7.907333333333332e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3139 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.4186666666666667, + "grad_norm": 35.715789794921875, + "kl": 1.095703125, + "learning_rate": 7.906666666666666e-07, + "loss": 0.0437, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3140 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.4188, + "grad_norm": 4.928835391998291, + "kl": 0.3701171875, + "learning_rate": 7.905999999999999e-07, + "loss": 0.0148, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3141 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5, + "epoch": 0.4189333333333333, + "grad_norm": 8.667545318603516, + "kl": 0.24072265625, + "learning_rate": 7.905333333333333e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3142 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.9375, + "epoch": 0.41906666666666664, + "grad_norm": 8.770484924316406, + "kl": 0.275390625, + "learning_rate": 7.904666666666666e-07, + "loss": 0.011, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3143 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.3125, + "epoch": 0.4192, + "grad_norm": 15.188555717468262, + "kl": 0.259765625, + "learning_rate": 7.904e-07, + "loss": 0.0104, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3144 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.875, + "epoch": 0.41933333333333334, + "grad_norm": 6.2787370681762695, + "kl": 0.2490234375, + "learning_rate": 7.903333333333333e-07, + "loss": 0.01, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3145 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.375, + "epoch": 0.41946666666666665, + "grad_norm": 11.1076078414917, + "kl": 0.20703125, + "learning_rate": 7.902666666666667e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3146 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.625, + "epoch": 0.4196, + "grad_norm": 7.936676025390625, + "kl": 0.5810546875, + "learning_rate": 7.902e-07, + "loss": 0.0232, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3147 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.41973333333333335, + "grad_norm": 5.088224411010742, + "kl": 0.28076171875, + "learning_rate": 7.901333333333334e-07, + "loss": 0.0112, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3148 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0, + "epoch": 0.41986666666666667, + "grad_norm": 8.837241172790527, + "kl": 0.28662109375, + "learning_rate": 7.900666666666667e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3149 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.42, + "grad_norm": 8.603696823120117, + "kl": 0.357421875, + "learning_rate": 7.9e-07, + "loss": 0.0143, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3150 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.0, + "epoch": 0.42013333333333336, + "grad_norm": 11.579880714416504, + "kl": 0.29736328125, + "learning_rate": 7.899333333333334e-07, + "loss": 0.0119, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3151 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.125, + "epoch": 0.4202666666666667, + "grad_norm": 7.8396759033203125, + "kl": 0.18017578125, + "learning_rate": 7.898666666666666e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3152 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9375, + "epoch": 0.4204, + "grad_norm": 3.668588638305664, + "kl": 0.2578125, + "learning_rate": 7.897999999999999e-07, + "loss": 0.0103, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3153 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.4205333333333333, + "grad_norm": 3.500211477279663, + "kl": 0.27099609375, + "learning_rate": 7.897333333333332e-07, + "loss": 0.0108, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3154 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0625, + "epoch": 0.4206666666666667, + "grad_norm": 5.929500102996826, + "kl": 0.23046875, + "learning_rate": 7.896666666666666e-07, + "loss": 0.0092, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3155 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.25, + "epoch": 0.4208, + "grad_norm": 9.088095664978027, + "kl": 0.4169921875, + "learning_rate": 7.895999999999999e-07, + "loss": 0.0167, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3156 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.6875, + "epoch": 0.4209333333333333, + "grad_norm": 7.418074131011963, + "kl": 0.2265625, + "learning_rate": 7.895333333333333e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3157 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0625, + "epoch": 0.42106666666666664, + "grad_norm": 7.981836318969727, + "kl": 0.21533203125, + "learning_rate": 7.894666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3158 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5, + "epoch": 0.4212, + "grad_norm": 12.5173921585083, + "kl": 0.1748046875, + "learning_rate": 7.893999999999999e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3159 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.42133333333333334, + "grad_norm": 9.203743934631348, + "kl": 0.3037109375, + "learning_rate": 7.893333333333333e-07, + "loss": 0.0122, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3160 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.875, + "epoch": 0.42146666666666666, + "grad_norm": 9.087285041809082, + "kl": 0.2958984375, + "learning_rate": 7.892666666666666e-07, + "loss": 0.0118, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3161 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 0.4216, + "grad_norm": 9.781163215637207, + "kl": 0.19384765625, + "learning_rate": 7.892e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3162 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.42173333333333335, + "grad_norm": 4.761561870574951, + "kl": 0.333984375, + "learning_rate": 7.891333333333333e-07, + "loss": 0.0133, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3163 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.8125, + "epoch": 0.42186666666666667, + "grad_norm": 6.5018630027771, + "kl": 0.1826171875, + "learning_rate": 7.890666666666667e-07, + "loss": 0.0073, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3164 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5, + "epoch": 0.422, + "grad_norm": 7.028570652008057, + "kl": 0.18505859375, + "learning_rate": 7.89e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3165 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 0.42213333333333336, + "grad_norm": 7.701554775238037, + "kl": 0.2177734375, + "learning_rate": 7.889333333333334e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3166 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 0.4222666666666667, + "grad_norm": 3.8388824462890625, + "kl": 0.18896484375, + "learning_rate": 7.888666666666667e-07, + "loss": 0.0075, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3167 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.4224, + "grad_norm": 5.119143486022949, + "kl": 0.21484375, + "learning_rate": 7.887999999999999e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3168 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.3125, + "epoch": 0.4225333333333333, + "grad_norm": 6.966103553771973, + "kl": 0.302734375, + "learning_rate": 7.887333333333333e-07, + "loss": 0.0122, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3169 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.25, + "epoch": 0.4226666666666667, + "grad_norm": 5.089352607727051, + "kl": 0.1953125, + "learning_rate": 7.886666666666666e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3170 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9375, + "epoch": 0.4228, + "grad_norm": 15.65630054473877, + "kl": 0.228515625, + "learning_rate": 7.886e-07, + "loss": 0.0091, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3171 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1875, + "epoch": 0.42293333333333333, + "grad_norm": 5.57004976272583, + "kl": 0.18798828125, + "learning_rate": 7.885333333333332e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3172 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.42306666666666665, + "grad_norm": 7.751125812530518, + "kl": 0.30322265625, + "learning_rate": 7.884666666666666e-07, + "loss": 0.0121, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3173 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0625, + "epoch": 0.4232, + "grad_norm": 0.5905752182006836, + "kl": 0.3427734375, + "learning_rate": 7.883999999999999e-07, + "loss": 0.0137, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3174 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0625, + "epoch": 0.42333333333333334, + "grad_norm": 3.8822450637817383, + "kl": 0.31298828125, + "learning_rate": 7.883333333333333e-07, + "loss": 0.0125, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3175 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.42346666666666666, + "grad_norm": 6.657088279724121, + "kl": 0.20361328125, + "learning_rate": 7.882666666666666e-07, + "loss": 0.0081, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3176 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.375, + "epoch": 0.4236, + "grad_norm": 9.33535385131836, + "kl": 0.16455078125, + "learning_rate": 7.881999999999999e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3177 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.375, + "epoch": 0.42373333333333335, + "grad_norm": 8.168123245239258, + "kl": 0.26904296875, + "learning_rate": 7.881333333333333e-07, + "loss": 0.0108, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3178 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.25, + "epoch": 0.42386666666666667, + "grad_norm": 34.33900451660156, + "kl": 0.228515625, + "learning_rate": 7.880666666666666e-07, + "loss": 0.0092, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3179 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5, + "epoch": 0.424, + "grad_norm": 5.054474830627441, + "kl": 0.17822265625, + "learning_rate": 7.88e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3180 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.125, + "epoch": 0.4241333333333333, + "grad_norm": 6.5336012840271, + "kl": 0.1591796875, + "learning_rate": 7.879333333333333e-07, + "loss": 0.0064, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3181 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5, + "epoch": 0.4242666666666667, + "grad_norm": 7.033720970153809, + "kl": 0.21875, + "learning_rate": 7.878666666666667e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3182 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.4244, + "grad_norm": 4.253870010375977, + "kl": 0.2333984375, + "learning_rate": 7.877999999999999e-07, + "loss": 0.0094, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3183 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.625, + "epoch": 0.4245333333333333, + "grad_norm": 5.30000114440918, + "kl": 0.1689453125, + "learning_rate": 7.877333333333333e-07, + "loss": 0.0067, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3184 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.6875, + "epoch": 0.4246666666666667, + "grad_norm": 7.982974052429199, + "kl": 0.19970703125, + "learning_rate": 7.876666666666666e-07, + "loss": 0.008, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3185 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0, + "epoch": 0.4248, + "grad_norm": 6.23579216003418, + "kl": 0.20166015625, + "learning_rate": 7.875999999999999e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3186 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.8125, + "epoch": 0.42493333333333333, + "grad_norm": 9.314181327819824, + "kl": 0.1572265625, + "learning_rate": 7.875333333333333e-07, + "loss": 0.0063, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3187 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 0.42506666666666665, + "grad_norm": 5.510793209075928, + "kl": 0.162109375, + "learning_rate": 7.874666666666666e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3188 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.375, + "epoch": 0.4252, + "grad_norm": 6.379396915435791, + "kl": 0.15673828125, + "learning_rate": 7.874e-07, + "loss": 0.0063, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3189 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.8125, + "epoch": 0.42533333333333334, + "grad_norm": 113.5605239868164, + "kl": 6.58056640625, + "learning_rate": 7.873333333333333e-07, + "loss": 0.2641, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3190 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0625, + "epoch": 0.42546666666666666, + "grad_norm": 8.596539497375488, + "kl": 0.2314453125, + "learning_rate": 7.872666666666667e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3191 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 0.4256, + "grad_norm": 5.029439926147461, + "kl": 0.341796875, + "learning_rate": 7.872e-07, + "loss": 0.0137, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3192 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0625, + "epoch": 0.42573333333333335, + "grad_norm": 4.739677906036377, + "kl": 0.21923828125, + "learning_rate": 7.871333333333334e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3193 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.42586666666666667, + "grad_norm": 8.041901588439941, + "kl": 0.18212890625, + "learning_rate": 7.870666666666666e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3194 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.0, + "epoch": 0.426, + "grad_norm": 5.569284915924072, + "kl": 0.2939453125, + "learning_rate": 7.87e-07, + "loss": 0.0118, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3195 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.4261333333333333, + "grad_norm": 10.583728790283203, + "kl": 0.4921875, + "learning_rate": 7.869333333333333e-07, + "loss": 0.0197, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 3196 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.4262666666666667, + "grad_norm": 7.473590850830078, + "kl": 0.2900390625, + "learning_rate": 7.868666666666666e-07, + "loss": 0.0116, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3197 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.125, + "epoch": 0.4264, + "grad_norm": 10.037421226501465, + "kl": 0.25390625, + "learning_rate": 7.868e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3198 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.4265333333333333, + "grad_norm": 4.907419204711914, + "kl": 0.22216796875, + "learning_rate": 7.867333333333332e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3199 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.125, + "epoch": 0.4266666666666667, + "grad_norm": 6.558238506317139, + "kl": 0.2216796875, + "learning_rate": 7.866666666666666e-07, + "loss": 0.0089, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3200 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.75, + "epoch": 0.4268, + "grad_norm": 11.971597671508789, + "kl": 0.35595703125, + "learning_rate": 7.865999999999999e-07, + "loss": 0.0143, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3201 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.9375, + "epoch": 0.42693333333333333, + "grad_norm": 6.353662490844727, + "kl": 0.29443359375, + "learning_rate": 7.865333333333333e-07, + "loss": 0.0118, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3202 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.42706666666666665, + "grad_norm": 6.851166725158691, + "kl": 0.193359375, + "learning_rate": 7.864666666666666e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3203 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.4272, + "grad_norm": 9.936026573181152, + "kl": 0.232421875, + "learning_rate": 7.864e-07, + "loss": 0.0093, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3204 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.75, + "epoch": 0.42733333333333334, + "grad_norm": 29.773775100708008, + "kl": 0.97802734375, + "learning_rate": 7.863333333333333e-07, + "loss": 0.0393, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3205 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.42746666666666666, + "grad_norm": 3.8904526233673096, + "kl": 0.15625, + "learning_rate": 7.862666666666666e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3206 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.875, + "epoch": 0.4276, + "grad_norm": 17.534523010253906, + "kl": 0.4404296875, + "learning_rate": 7.862e-07, + "loss": 0.0176, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3207 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5625, + "epoch": 0.42773333333333335, + "grad_norm": 6.539990425109863, + "kl": 0.27734375, + "learning_rate": 7.861333333333333e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3208 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0625, + "epoch": 0.4278666666666667, + "grad_norm": 0.7903090119361877, + "kl": 0.26318359375, + "learning_rate": 7.860666666666667e-07, + "loss": 0.0105, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3209 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.8125, + "epoch": 0.428, + "grad_norm": 15.872941970825195, + "kl": 0.25341796875, + "learning_rate": 7.86e-07, + "loss": 0.0101, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3210 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.75, + "epoch": 0.4281333333333333, + "grad_norm": 6.873401641845703, + "kl": 0.2783203125, + "learning_rate": 7.859333333333334e-07, + "loss": 0.0111, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3211 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0, + "epoch": 0.4282666666666667, + "grad_norm": 5.439950466156006, + "kl": 0.2412109375, + "learning_rate": 7.858666666666667e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3212 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.4284, + "grad_norm": 9.119206428527832, + "kl": 0.19677734375, + "learning_rate": 7.858000000000001e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3213 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.625, + "epoch": 0.4285333333333333, + "grad_norm": 39.34450912475586, + "kl": 0.4033203125, + "learning_rate": 7.857333333333332e-07, + "loss": 0.0161, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3214 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.42866666666666664, + "grad_norm": 25.065107345581055, + "kl": 0.2724609375, + "learning_rate": 7.856666666666665e-07, + "loss": 0.0109, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3215 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.4288, + "grad_norm": 6.076460838317871, + "kl": 0.2490234375, + "learning_rate": 7.855999999999999e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3216 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.42893333333333333, + "grad_norm": 7.374367713928223, + "kl": 0.287109375, + "learning_rate": 7.855333333333332e-07, + "loss": 0.0115, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3217 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.4375, + "epoch": 0.42906666666666665, + "grad_norm": 5.019228935241699, + "kl": 0.13720703125, + "learning_rate": 7.854666666666666e-07, + "loss": 0.0055, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3218 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.4292, + "grad_norm": 7.768800258636475, + "kl": 0.23046875, + "learning_rate": 7.853999999999999e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3219 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.42933333333333334, + "grad_norm": 8.583407402038574, + "kl": 0.361328125, + "learning_rate": 7.853333333333333e-07, + "loss": 0.0144, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3220 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.875, + "epoch": 0.42946666666666666, + "grad_norm": 4.687839508056641, + "kl": 0.1875, + "learning_rate": 7.852666666666666e-07, + "loss": 0.0075, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3221 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5625, + "epoch": 0.4296, + "grad_norm": 5.025930404663086, + "kl": 0.13671875, + "learning_rate": 7.852e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3222 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.8125, + "epoch": 0.42973333333333336, + "grad_norm": 8.882380485534668, + "kl": 0.2451171875, + "learning_rate": 7.851333333333333e-07, + "loss": 0.0098, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3223 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 0.4298666666666667, + "grad_norm": 6.064931869506836, + "kl": 0.162109375, + "learning_rate": 7.850666666666666e-07, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3224 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.9375, + "epoch": 0.43, + "grad_norm": 6.16487979888916, + "kl": 0.26025390625, + "learning_rate": 7.85e-07, + "loss": 0.0104, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3225 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 0.4301333333333333, + "grad_norm": 4.781462669372559, + "kl": 0.1904296875, + "learning_rate": 7.849333333333333e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3226 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.4302666666666667, + "grad_norm": 4.79668664932251, + "kl": 0.2197265625, + "learning_rate": 7.848666666666667e-07, + "loss": 0.0088, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3227 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.75, + "epoch": 0.4304, + "grad_norm": 10.060317993164062, + "kl": 0.26953125, + "learning_rate": 7.848e-07, + "loss": 0.0108, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3228 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 0.4305333333333333, + "grad_norm": 5.262172698974609, + "kl": 0.29345703125, + "learning_rate": 7.847333333333333e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3229 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.9375, + "epoch": 0.43066666666666664, + "grad_norm": 7.379050254821777, + "kl": 0.392578125, + "learning_rate": 7.846666666666666e-07, + "loss": 0.0157, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 3230 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1875, + "epoch": 0.4308, + "grad_norm": 6.476879119873047, + "kl": 0.1806640625, + "learning_rate": 7.846e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3231 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.8125, + "epoch": 0.43093333333333333, + "grad_norm": 14.073518753051758, + "kl": 0.791015625, + "learning_rate": 7.845333333333333e-07, + "loss": 0.0316, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3232 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 0.43106666666666665, + "grad_norm": 7.060639381408691, + "kl": 0.177734375, + "learning_rate": 7.844666666666666e-07, + "loss": 0.0071, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3233 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.625, + "epoch": 0.4312, + "grad_norm": 5.647806644439697, + "kl": 0.22119140625, + "learning_rate": 7.844e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3234 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.43133333333333335, + "grad_norm": 14.897685050964355, + "kl": 0.1865234375, + "learning_rate": 7.843333333333332e-07, + "loss": 0.0075, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3235 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1875, + "epoch": 0.43146666666666667, + "grad_norm": 6.430875301361084, + "kl": 0.34765625, + "learning_rate": 7.842666666666666e-07, + "loss": 0.0139, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3236 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.4316, + "grad_norm": 9.569488525390625, + "kl": 0.259765625, + "learning_rate": 7.841999999999999e-07, + "loss": 0.0104, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3237 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.625, + "epoch": 0.43173333333333336, + "grad_norm": 8.870147705078125, + "kl": 0.263671875, + "learning_rate": 7.841333333333333e-07, + "loss": 0.0105, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3238 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.4318666666666667, + "grad_norm": 8.13780689239502, + "kl": 0.19287109375, + "learning_rate": 7.840666666666666e-07, + "loss": 0.0077, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3239 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.4375, + "epoch": 0.432, + "grad_norm": 19.266008377075195, + "kl": 0.1806640625, + "learning_rate": 7.84e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3240 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0625, + "epoch": 0.4321333333333333, + "grad_norm": 9.066024780273438, + "kl": 0.224609375, + "learning_rate": 7.839333333333333e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3241 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.4322666666666667, + "grad_norm": 8.017587661743164, + "kl": 0.2353515625, + "learning_rate": 7.838666666666667e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3242 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5625, + "epoch": 0.4324, + "grad_norm": 0.3260616660118103, + "kl": 0.16748046875, + "learning_rate": 7.838e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3243 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.25, + "epoch": 0.4325333333333333, + "grad_norm": 6.1689887046813965, + "kl": 0.2431640625, + "learning_rate": 7.837333333333332e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3244 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.4375, + "epoch": 0.43266666666666664, + "grad_norm": 4.69876766204834, + "kl": 0.27783203125, + "learning_rate": 7.836666666666666e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3245 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.25, + "epoch": 0.4328, + "grad_norm": 9.612675666809082, + "kl": 0.25927734375, + "learning_rate": 7.835999999999999e-07, + "loss": 0.0104, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3246 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.3125, + "epoch": 0.43293333333333334, + "grad_norm": 8.580057144165039, + "kl": 0.22314453125, + "learning_rate": 7.835333333333333e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3247 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0, + "epoch": 0.43306666666666666, + "grad_norm": 4.602025032043457, + "kl": 0.13916015625, + "learning_rate": 7.834666666666666e-07, + "loss": 0.0056, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3248 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0, + "epoch": 0.4332, + "grad_norm": 4.906609535217285, + "kl": 0.21484375, + "learning_rate": 7.834e-07, + "loss": 0.0086, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3249 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.43333333333333335, + "grad_norm": 31.83833885192871, + "kl": 0.2509765625, + "learning_rate": 7.833333333333333e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3250 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 0.43346666666666667, + "grad_norm": 7.159491539001465, + "kl": 0.2109375, + "learning_rate": 7.832666666666667e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3251 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.25, + "epoch": 0.4336, + "grad_norm": 7.838438987731934, + "kl": 0.244140625, + "learning_rate": 7.832e-07, + "loss": 0.0098, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3252 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.125, + "epoch": 0.43373333333333336, + "grad_norm": 12.100494384765625, + "kl": 0.28076171875, + "learning_rate": 7.831333333333333e-07, + "loss": 0.0112, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3253 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5, + "epoch": 0.4338666666666667, + "grad_norm": 8.859332084655762, + "kl": 0.203125, + "learning_rate": 7.830666666666667e-07, + "loss": 0.0081, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3254 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 0.434, + "grad_norm": 40.767757415771484, + "kl": 0.21826171875, + "learning_rate": 7.83e-07, + "loss": 0.0087, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3255 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.25, + "epoch": 0.4341333333333333, + "grad_norm": 14.59780502319336, + "kl": 0.21240234375, + "learning_rate": 7.829333333333334e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3256 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5, + "epoch": 0.4342666666666667, + "grad_norm": 5.10127067565918, + "kl": 0.2490234375, + "learning_rate": 7.828666666666666e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3257 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.8125, + "epoch": 0.4344, + "grad_norm": 5.258898735046387, + "kl": 0.1630859375, + "learning_rate": 7.828e-07, + "loss": 0.0065, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3258 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3125, + "epoch": 0.4345333333333333, + "grad_norm": 9.1240234375, + "kl": 0.3193359375, + "learning_rate": 7.827333333333332e-07, + "loss": 0.0128, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3259 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.1875, + "epoch": 0.43466666666666665, + "grad_norm": 3.2706708908081055, + "kl": 0.283203125, + "learning_rate": 7.826666666666666e-07, + "loss": 0.0113, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3260 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.125, + "epoch": 0.4348, + "grad_norm": 7.824070930480957, + "kl": 0.234375, + "learning_rate": 7.825999999999999e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3261 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.43493333333333334, + "grad_norm": 7.308884143829346, + "kl": 0.24365234375, + "learning_rate": 7.825333333333332e-07, + "loss": 0.0097, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3262 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1875, + "epoch": 0.43506666666666666, + "grad_norm": 7.215539932250977, + "kl": 0.2802734375, + "learning_rate": 7.824666666666666e-07, + "loss": 0.0112, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3263 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.375, + "epoch": 0.4352, + "grad_norm": 26.465808868408203, + "kl": 0.16796875, + "learning_rate": 7.823999999999999e-07, + "loss": 0.0067, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3264 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 0.43533333333333335, + "grad_norm": 7.136651515960693, + "kl": 0.2021484375, + "learning_rate": 7.823333333333333e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3265 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 0.43546666666666667, + "grad_norm": 7.8048200607299805, + "kl": 0.19140625, + "learning_rate": 7.822666666666666e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3266 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5625, + "epoch": 0.4356, + "grad_norm": 6.615394592285156, + "kl": 0.19775390625, + "learning_rate": 7.822e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3267 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 0.4357333333333333, + "grad_norm": 7.09544563293457, + "kl": 0.26123046875, + "learning_rate": 7.821333333333333e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3268 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.4358666666666667, + "grad_norm": 0.5682937502861023, + "kl": 0.189453125, + "learning_rate": 7.820666666666667e-07, + "loss": 0.0076, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3269 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.75, + "epoch": 0.436, + "grad_norm": 12.600449562072754, + "kl": 0.17333984375, + "learning_rate": 7.82e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3270 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.4361333333333333, + "grad_norm": 8.412938117980957, + "kl": 0.255859375, + "learning_rate": 7.819333333333333e-07, + "loss": 0.0102, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3271 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 0.4362666666666667, + "grad_norm": 5.421636581420898, + "kl": 0.197265625, + "learning_rate": 7.818666666666667e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3272 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.125, + "epoch": 0.4364, + "grad_norm": 12.28888988494873, + "kl": 0.2607421875, + "learning_rate": 7.818e-07, + "loss": 0.0104, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3273 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.875, + "epoch": 0.43653333333333333, + "grad_norm": 9.15059757232666, + "kl": 0.19970703125, + "learning_rate": 7.817333333333333e-07, + "loss": 0.008, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3274 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.625, + "epoch": 0.43666666666666665, + "grad_norm": 7.0355682373046875, + "kl": 0.3251953125, + "learning_rate": 7.816666666666666e-07, + "loss": 0.013, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3275 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.4368, + "grad_norm": 5.146827220916748, + "kl": 0.18994140625, + "learning_rate": 7.816e-07, + "loss": 0.0076, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3276 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.6875, + "epoch": 0.43693333333333334, + "grad_norm": 8.676867485046387, + "kl": 0.17626953125, + "learning_rate": 7.815333333333333e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3277 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.43706666666666666, + "grad_norm": 6.454319953918457, + "kl": 0.263916015625, + "learning_rate": 7.814666666666666e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3278 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.4372, + "grad_norm": 5.617560386657715, + "kl": 0.337890625, + "learning_rate": 7.813999999999999e-07, + "loss": 0.0135, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3279 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.125, + "epoch": 0.43733333333333335, + "grad_norm": 3.903597354888916, + "kl": 0.185546875, + "learning_rate": 7.813333333333332e-07, + "loss": 0.0074, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3280 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 0.43746666666666667, + "grad_norm": 8.774321556091309, + "kl": 0.20361328125, + "learning_rate": 7.812666666666666e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3281 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.75, + "epoch": 0.4376, + "grad_norm": 4.21595573425293, + "kl": 0.17724609375, + "learning_rate": 7.811999999999999e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3282 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 0.4377333333333333, + "grad_norm": 5.002242565155029, + "kl": 0.1640625, + "learning_rate": 7.811333333333333e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3283 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.125, + "epoch": 0.4378666666666667, + "grad_norm": 7.856531143188477, + "kl": 0.19921875, + "learning_rate": 7.810666666666666e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3284 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.125, + "epoch": 0.438, + "grad_norm": 6.902761459350586, + "kl": 0.2275390625, + "learning_rate": 7.81e-07, + "loss": 0.0091, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3285 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1875, + "epoch": 0.4381333333333333, + "grad_norm": 6.41436767578125, + "kl": 0.2294921875, + "learning_rate": 7.809333333333333e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3286 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.4382666666666667, + "grad_norm": 5.324419021606445, + "kl": 0.3251953125, + "learning_rate": 7.808666666666667e-07, + "loss": 0.013, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3287 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.375, + "epoch": 0.4384, + "grad_norm": 17.450925827026367, + "kl": 0.26318359375, + "learning_rate": 7.808e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3288 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.43853333333333333, + "grad_norm": 9.010656356811523, + "kl": 0.2158203125, + "learning_rate": 7.807333333333333e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3289 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.43866666666666665, + "grad_norm": 8.827290534973145, + "kl": 0.2509765625, + "learning_rate": 7.806666666666666e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3290 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.3125, + "epoch": 0.4388, + "grad_norm": 6.626047134399414, + "kl": 0.18798828125, + "learning_rate": 7.805999999999999e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3291 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5625, + "epoch": 0.43893333333333334, + "grad_norm": 7.061432361602783, + "kl": 0.16845703125, + "learning_rate": 7.805333333333333e-07, + "loss": 0.0067, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3292 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.8125, + "epoch": 0.43906666666666666, + "grad_norm": 8.518251419067383, + "kl": 0.259765625, + "learning_rate": 7.804666666666666e-07, + "loss": 0.0104, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3293 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 0.4392, + "grad_norm": 8.274791717529297, + "kl": 0.2734375, + "learning_rate": 7.804e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3294 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.43933333333333335, + "grad_norm": 11.670238494873047, + "kl": 0.30908203125, + "learning_rate": 7.803333333333333e-07, + "loss": 0.0124, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 3295 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.75, + "epoch": 0.43946666666666667, + "grad_norm": 5.0619940757751465, + "kl": 0.27197265625, + "learning_rate": 7.802666666666667e-07, + "loss": 0.0109, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3296 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0625, + "epoch": 0.4396, + "grad_norm": 4.993666172027588, + "kl": 0.31494140625, + "learning_rate": 7.802e-07, + "loss": 0.0126, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3297 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.375, + "epoch": 0.4397333333333333, + "grad_norm": 8.044970512390137, + "kl": 0.17626953125, + "learning_rate": 7.801333333333334e-07, + "loss": 0.0071, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3298 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.0, + "epoch": 0.4398666666666667, + "grad_norm": 10.301861763000488, + "kl": 0.2978515625, + "learning_rate": 7.800666666666667e-07, + "loss": 0.0119, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3299 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.44, + "grad_norm": 7.292743682861328, + "kl": 0.3076171875, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0123, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3300 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 0.4401333333333333, + "grad_norm": 8.743672370910645, + "kl": 0.2880859375, + "learning_rate": 7.799333333333333e-07, + "loss": 0.0115, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3301 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0625, + "epoch": 0.44026666666666664, + "grad_norm": 5.373223781585693, + "kl": 0.1845703125, + "learning_rate": 7.798666666666666e-07, + "loss": 0.0074, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3302 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.9375, + "epoch": 0.4404, + "grad_norm": 6.38722562789917, + "kl": 0.306640625, + "learning_rate": 7.798e-07, + "loss": 0.0123, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3303 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 0.44053333333333333, + "grad_norm": 11.763447761535645, + "kl": 0.240234375, + "learning_rate": 7.797333333333332e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3304 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.0, + "epoch": 0.44066666666666665, + "grad_norm": 13.054798126220703, + "kl": 0.19384765625, + "learning_rate": 7.796666666666666e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3305 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.3125, + "epoch": 0.4408, + "grad_norm": 9.412934303283691, + "kl": 0.328125, + "learning_rate": 7.795999999999999e-07, + "loss": 0.0131, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 3306 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5, + "epoch": 0.44093333333333334, + "grad_norm": 0.4833342432975769, + "kl": 0.2783203125, + "learning_rate": 7.795333333333333e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3307 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.4375, + "epoch": 0.44106666666666666, + "grad_norm": 59.08681869506836, + "kl": 0.2529296875, + "learning_rate": 7.794666666666666e-07, + "loss": 0.0101, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3308 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.1875, + "epoch": 0.4412, + "grad_norm": 7.976446151733398, + "kl": 0.36865234375, + "learning_rate": 7.793999999999999e-07, + "loss": 0.0147, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3309 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.9375, + "epoch": 0.44133333333333336, + "grad_norm": 7.87222957611084, + "kl": 0.25244140625, + "learning_rate": 7.793333333333333e-07, + "loss": 0.0101, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3310 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.75, + "epoch": 0.4414666666666667, + "grad_norm": 7.211256504058838, + "kl": 0.220703125, + "learning_rate": 7.792666666666666e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3311 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.25, + "epoch": 0.4416, + "grad_norm": 7.358504295349121, + "kl": 0.21435546875, + "learning_rate": 7.792e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3312 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.5, + "epoch": 0.4417333333333333, + "grad_norm": 11.28618049621582, + "kl": 0.2802734375, + "learning_rate": 7.791333333333333e-07, + "loss": 0.0112, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3313 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.75, + "epoch": 0.4418666666666667, + "grad_norm": 6.219794750213623, + "kl": 0.220703125, + "learning_rate": 7.790666666666667e-07, + "loss": 0.0088, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3314 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.0, + "epoch": 0.442, + "grad_norm": 9.236985206604004, + "kl": 0.23486328125, + "learning_rate": 7.79e-07, + "loss": 0.0094, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3315 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5625, + "epoch": 0.4421333333333333, + "grad_norm": 39.67341613769531, + "kl": 0.3779296875, + "learning_rate": 7.789333333333334e-07, + "loss": 0.0151, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3316 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.0, + "epoch": 0.44226666666666664, + "grad_norm": 6.4001030921936035, + "kl": 0.27783203125, + "learning_rate": 7.788666666666667e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3317 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 0.4424, + "grad_norm": 0.3339773416519165, + "kl": 0.2158203125, + "learning_rate": 7.788000000000001e-07, + "loss": 0.0086, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3318 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.3125, + "epoch": 0.44253333333333333, + "grad_norm": 4.084094524383545, + "kl": 0.25830078125, + "learning_rate": 7.787333333333334e-07, + "loss": 0.0104, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3319 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.44266666666666665, + "grad_norm": 8.161017417907715, + "kl": 0.2158203125, + "learning_rate": 7.786666666666665e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3320 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.6875, + "epoch": 0.4428, + "grad_norm": 6.50437068939209, + "kl": 0.21875, + "learning_rate": 7.785999999999999e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3321 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.375, + "epoch": 0.44293333333333335, + "grad_norm": 0.5384450554847717, + "kl": 0.27734375, + "learning_rate": 7.785333333333332e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3322 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.3125, + "epoch": 0.44306666666666666, + "grad_norm": 7.680935382843018, + "kl": 0.18505859375, + "learning_rate": 7.784666666666666e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3323 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 0.4432, + "grad_norm": 4.1707234382629395, + "kl": 0.3125, + "learning_rate": 7.783999999999999e-07, + "loss": 0.0125, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3324 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0625, + "epoch": 0.44333333333333336, + "grad_norm": 17.485759735107422, + "kl": 0.21337890625, + "learning_rate": 7.783333333333333e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3325 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5625, + "epoch": 0.4434666666666667, + "grad_norm": 7.397639751434326, + "kl": 0.21923828125, + "learning_rate": 7.782666666666666e-07, + "loss": 0.0088, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3326 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.4436, + "grad_norm": 5.709939002990723, + "kl": 0.279296875, + "learning_rate": 7.782e-07, + "loss": 0.0112, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3327 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.4437333333333333, + "grad_norm": 7.7006378173828125, + "kl": 0.5537109375, + "learning_rate": 7.781333333333333e-07, + "loss": 0.0221, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3328 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.4438666666666667, + "grad_norm": 7.522302627563477, + "kl": 0.16552734375, + "learning_rate": 7.780666666666666e-07, + "loss": 0.0066, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3329 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8125, + "epoch": 0.444, + "grad_norm": 7.107240676879883, + "kl": 0.2080078125, + "learning_rate": 7.78e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3330 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.875, + "epoch": 0.4441333333333333, + "grad_norm": 9.525859832763672, + "kl": 0.392578125, + "learning_rate": 7.779333333333333e-07, + "loss": 0.0157, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3331 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.4375, + "epoch": 0.44426666666666664, + "grad_norm": 4.994140625, + "kl": 0.236328125, + "learning_rate": 7.778666666666667e-07, + "loss": 0.0094, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3332 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.75, + "epoch": 0.4444, + "grad_norm": 33.21808624267578, + "kl": 0.2255859375, + "learning_rate": 7.778e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3333 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 0.44453333333333334, + "grad_norm": 4.927210807800293, + "kl": 0.2021484375, + "learning_rate": 7.777333333333334e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.375, + "epoch": 0.44466666666666665, + "grad_norm": 9.103181838989258, + "kl": 0.3876953125, + "learning_rate": 7.776666666666666e-07, + "loss": 0.0155, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3335 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 0.4448, + "grad_norm": 8.071831703186035, + "kl": 0.2236328125, + "learning_rate": 7.776e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3336 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.44493333333333335, + "grad_norm": 85.82416534423828, + "kl": 0.1962890625, + "learning_rate": 7.775333333333333e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3337 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.44506666666666667, + "grad_norm": 6.097800254821777, + "kl": 0.18701171875, + "learning_rate": 7.774666666666666e-07, + "loss": 0.0075, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3338 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.625, + "epoch": 0.4452, + "grad_norm": 5.98917293548584, + "kl": 0.1923828125, + "learning_rate": 7.774e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3339 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.875, + "epoch": 0.44533333333333336, + "grad_norm": 6.887094020843506, + "kl": 0.27197265625, + "learning_rate": 7.773333333333333e-07, + "loss": 0.0109, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3340 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0, + "epoch": 0.4454666666666667, + "grad_norm": 8.438331604003906, + "kl": 0.29248046875, + "learning_rate": 7.772666666666667e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3341 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.1875, + "epoch": 0.4456, + "grad_norm": 0.4413064122200012, + "kl": 0.24609375, + "learning_rate": 7.771999999999999e-07, + "loss": 0.0098, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3342 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.4457333333333333, + "grad_norm": 8.054128646850586, + "kl": 0.37158203125, + "learning_rate": 7.771333333333333e-07, + "loss": 0.0149, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3343 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.4458666666666667, + "grad_norm": 1.0810668468475342, + "kl": 0.32421875, + "learning_rate": 7.770666666666666e-07, + "loss": 0.013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3344 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5, + "epoch": 0.446, + "grad_norm": 4.895350933074951, + "kl": 0.1591796875, + "learning_rate": 7.77e-07, + "loss": 0.0064, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3345 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.75, + "epoch": 0.4461333333333333, + "grad_norm": 7.199577331542969, + "kl": 0.224609375, + "learning_rate": 7.769333333333333e-07, + "loss": 0.009, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3346 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.6875, + "epoch": 0.44626666666666664, + "grad_norm": 15.911148071289062, + "kl": 0.23828125, + "learning_rate": 7.768666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3347 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.4464, + "grad_norm": 4.231618881225586, + "kl": 0.28857421875, + "learning_rate": 7.768e-07, + "loss": 0.0116, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3348 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.125, + "epoch": 0.44653333333333334, + "grad_norm": 8.31259822845459, + "kl": 0.3310546875, + "learning_rate": 7.767333333333333e-07, + "loss": 0.0133, + "reward": 1.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3349 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 0.44666666666666666, + "grad_norm": 7.584954261779785, + "kl": 0.17578125, + "learning_rate": 7.766666666666666e-07, + "loss": 0.007, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3350 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5625, + "epoch": 0.4468, + "grad_norm": 4.71063756942749, + "kl": 0.1435546875, + "learning_rate": 7.765999999999999e-07, + "loss": 0.0057, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3351 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 0.44693333333333335, + "grad_norm": 4.893280982971191, + "kl": 0.23388671875, + "learning_rate": 7.765333333333333e-07, + "loss": 0.0094, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3352 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.6875, + "epoch": 0.44706666666666667, + "grad_norm": 0.3142944574356079, + "kl": 0.244140625, + "learning_rate": 7.764666666666666e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3353 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.3125, + "epoch": 0.4472, + "grad_norm": 7.137091159820557, + "kl": 0.27392578125, + "learning_rate": 7.764e-07, + "loss": 0.011, + "reward": 1.5, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 3354 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.1875, + "epoch": 0.44733333333333336, + "grad_norm": 15.39418888092041, + "kl": 0.2021484375, + "learning_rate": 7.763333333333333e-07, + "loss": 0.0081, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3355 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.4474666666666667, + "grad_norm": 5.978438854217529, + "kl": 0.29443359375, + "learning_rate": 7.762666666666666e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3356 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.4476, + "grad_norm": 396.6881103515625, + "kl": 0.76953125, + "learning_rate": 7.762e-07, + "loss": 0.0308, + "reward": 1.1875, + "reward_std": 0.7253239452838898, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 3357 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.4477333333333333, + "grad_norm": 7.113209247589111, + "kl": 0.25341796875, + "learning_rate": 7.761333333333333e-07, + "loss": 0.0101, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3358 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 0.4478666666666667, + "grad_norm": 4.3992156982421875, + "kl": 0.19580078125, + "learning_rate": 7.760666666666667e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3359 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.6875, + "epoch": 0.448, + "grad_norm": 4.3407511711120605, + "kl": 0.2216796875, + "learning_rate": 7.76e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3360 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8125, + "epoch": 0.44813333333333333, + "grad_norm": 1.45750093460083, + "kl": 0.26513671875, + "learning_rate": 7.759333333333334e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3361 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 0.44826666666666665, + "grad_norm": 18.74249267578125, + "kl": 0.22802734375, + "learning_rate": 7.758666666666667e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3362 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 0.4484, + "grad_norm": 6.947057723999023, + "kl": 0.22119140625, + "learning_rate": 7.758e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3363 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.625, + "epoch": 0.44853333333333334, + "grad_norm": 6.26716423034668, + "kl": 0.24267578125, + "learning_rate": 7.757333333333333e-07, + "loss": 0.0097, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3364 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.3125, + "epoch": 0.44866666666666666, + "grad_norm": 6.381173133850098, + "kl": 0.19287109375, + "learning_rate": 7.756666666666665e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3365 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.75, + "epoch": 0.4488, + "grad_norm": 8.067770004272461, + "kl": 0.248046875, + "learning_rate": 7.755999999999999e-07, + "loss": 0.0099, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3366 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.44893333333333335, + "grad_norm": 7.428325176239014, + "kl": 0.23779296875, + "learning_rate": 7.755333333333332e-07, + "loss": 0.0095, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3367 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5625, + "epoch": 0.44906666666666667, + "grad_norm": 7.3597235679626465, + "kl": 0.17041015625, + "learning_rate": 7.754666666666666e-07, + "loss": 0.0068, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3368 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6875, + "epoch": 0.4492, + "grad_norm": 7.347670555114746, + "kl": 0.19189453125, + "learning_rate": 7.753999999999999e-07, + "loss": 0.0077, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3369 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.125, + "epoch": 0.4493333333333333, + "grad_norm": 5.308242321014404, + "kl": 0.19482421875, + "learning_rate": 7.753333333333333e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3370 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5, + "epoch": 0.4494666666666667, + "grad_norm": 6.3109636306762695, + "kl": 0.20068359375, + "learning_rate": 7.752666666666666e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3371 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3125, + "epoch": 0.4496, + "grad_norm": 7.403562545776367, + "kl": 0.2158203125, + "learning_rate": 7.752e-07, + "loss": 0.0086, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3372 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.3125, + "epoch": 0.4497333333333333, + "grad_norm": 7.161877155303955, + "kl": 0.34521484375, + "learning_rate": 7.751333333333333e-07, + "loss": 0.0138, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3373 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 0.4498666666666667, + "grad_norm": 5.103109836578369, + "kl": 0.27783203125, + "learning_rate": 7.750666666666667e-07, + "loss": 0.0111, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3374 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 0.45, + "grad_norm": 9.144184112548828, + "kl": 0.27490234375, + "learning_rate": 7.75e-07, + "loss": 0.011, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3375 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.3125, + "epoch": 0.45013333333333333, + "grad_norm": 5.984171390533447, + "kl": 0.17578125, + "learning_rate": 7.749333333333333e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3376 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0625, + "epoch": 0.45026666666666665, + "grad_norm": 6.006974697113037, + "kl": 0.25927734375, + "learning_rate": 7.748666666666667e-07, + "loss": 0.0104, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3377 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.4504, + "grad_norm": 6.69942569732666, + "kl": 0.151611328125, + "learning_rate": 7.748e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3378 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3125, + "epoch": 0.45053333333333334, + "grad_norm": 6.474479675292969, + "kl": 0.22412109375, + "learning_rate": 7.747333333333334e-07, + "loss": 0.009, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3379 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 0.45066666666666666, + "grad_norm": 6.586063861846924, + "kl": 0.21435546875, + "learning_rate": 7.746666666666666e-07, + "loss": 0.0086, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3380 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.75, + "epoch": 0.4508, + "grad_norm": 5.627462387084961, + "kl": 0.296875, + "learning_rate": 7.746e-07, + "loss": 0.0119, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3381 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.125, + "epoch": 0.45093333333333335, + "grad_norm": 10.72441577911377, + "kl": 0.2626953125, + "learning_rate": 7.745333333333333e-07, + "loss": 0.0105, + "reward": 1.4375, + "reward_std": 0.8152145147323608, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 3382 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.1875, + "epoch": 0.45106666666666667, + "grad_norm": 7.505021095275879, + "kl": 0.21435546875, + "learning_rate": 7.744666666666667e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3383 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 0.4512, + "grad_norm": 8.469161033630371, + "kl": 0.1669921875, + "learning_rate": 7.743999999999999e-07, + "loss": 0.0067, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3384 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.125, + "epoch": 0.4513333333333333, + "grad_norm": 11.1008882522583, + "kl": 0.2841796875, + "learning_rate": 7.743333333333332e-07, + "loss": 0.0114, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3385 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 0.4514666666666667, + "grad_norm": 7.141878604888916, + "kl": 0.2294921875, + "learning_rate": 7.742666666666666e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3386 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.875, + "epoch": 0.4516, + "grad_norm": 10.821825981140137, + "kl": 0.42919921875, + "learning_rate": 7.741999999999999e-07, + "loss": 0.0172, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3387 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 0.4517333333333333, + "grad_norm": 6.876591682434082, + "kl": 0.21875, + "learning_rate": 7.741333333333333e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3388 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5, + "epoch": 0.4518666666666667, + "grad_norm": 8.111998558044434, + "kl": 0.24169921875, + "learning_rate": 7.740666666666666e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3389 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0625, + "epoch": 0.452, + "grad_norm": 4.6714982986450195, + "kl": 0.20751953125, + "learning_rate": 7.74e-07, + "loss": 0.0083, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3390 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.375, + "epoch": 0.45213333333333333, + "grad_norm": 7.773019790649414, + "kl": 0.248046875, + "learning_rate": 7.739333333333333e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3391 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 0.45226666666666665, + "grad_norm": 4.376029968261719, + "kl": 0.193359375, + "learning_rate": 7.738666666666667e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3392 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 0.4524, + "grad_norm": 7.176571846008301, + "kl": 0.1982421875, + "learning_rate": 7.738e-07, + "loss": 0.0079, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3393 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.75, + "epoch": 0.45253333333333334, + "grad_norm": 8.302786827087402, + "kl": 0.19775390625, + "learning_rate": 7.737333333333333e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3394 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 0.45266666666666666, + "grad_norm": 6.554755210876465, + "kl": 0.23779296875, + "learning_rate": 7.736666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 3395 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.4375, + "epoch": 0.4528, + "grad_norm": 5.813750267028809, + "kl": 0.23046875, + "learning_rate": 7.735999999999999e-07, + "loss": 0.0092, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3396 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.3125, + "epoch": 0.45293333333333335, + "grad_norm": 0.5142168998718262, + "kl": 0.21142578125, + "learning_rate": 7.735333333333333e-07, + "loss": 0.0085, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3397 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 0.4530666666666667, + "grad_norm": 5.357003211975098, + "kl": 0.205078125, + "learning_rate": 7.734666666666666e-07, + "loss": 0.0082, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3398 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5625, + "epoch": 0.4532, + "grad_norm": 32.36801528930664, + "kl": 0.2890625, + "learning_rate": 7.734e-07, + "loss": 0.0115, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3399 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.4533333333333333, + "grad_norm": 7.083343982696533, + "kl": 0.2060546875, + "learning_rate": 7.733333333333333e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3400 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.3125, + "epoch": 0.4534666666666667, + "grad_norm": 5.604541301727295, + "kl": 0.2685546875, + "learning_rate": 7.732666666666667e-07, + "loss": 0.0107, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3401 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.1875, + "epoch": 0.4536, + "grad_norm": 14.632474899291992, + "kl": 0.296875, + "learning_rate": 7.732e-07, + "loss": 0.0119, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3402 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.0625, + "epoch": 0.4537333333333333, + "grad_norm": 7.520073890686035, + "kl": 0.197265625, + "learning_rate": 7.731333333333333e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3403 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.625, + "epoch": 0.45386666666666664, + "grad_norm": 3.629021644592285, + "kl": 0.16015625, + "learning_rate": 7.730666666666667e-07, + "loss": 0.0064, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3404 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.454, + "grad_norm": 9.012198448181152, + "kl": 0.2099609375, + "learning_rate": 7.729999999999999e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3405 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.45413333333333333, + "grad_norm": 6.231736183166504, + "kl": 0.19091796875, + "learning_rate": 7.729333333333333e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3406 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.875, + "epoch": 0.45426666666666665, + "grad_norm": 5.901536464691162, + "kl": 0.24853515625, + "learning_rate": 7.728666666666666e-07, + "loss": 0.0099, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3407 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.4544, + "grad_norm": 5.500549793243408, + "kl": 0.17333984375, + "learning_rate": 7.728e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3408 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 0.45453333333333334, + "grad_norm": 9.700502395629883, + "kl": 0.16455078125, + "learning_rate": 7.727333333333333e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3409 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5, + "epoch": 0.45466666666666666, + "grad_norm": 5.656438827514648, + "kl": 0.1591796875, + "learning_rate": 7.726666666666666e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3410 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.4548, + "grad_norm": 7.093088150024414, + "kl": 0.20458984375, + "learning_rate": 7.725999999999999e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3411 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.45493333333333336, + "grad_norm": 6.901106357574463, + "kl": 0.20458984375, + "learning_rate": 7.725333333333332e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3412 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 0.4550666666666667, + "grad_norm": 3.6936278343200684, + "kl": 0.24072265625, + "learning_rate": 7.724666666666666e-07, + "loss": 0.0096, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3413 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.375, + "epoch": 0.4552, + "grad_norm": 9.051505088806152, + "kl": 0.22802734375, + "learning_rate": 7.723999999999999e-07, + "loss": 0.0091, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3414 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5625, + "epoch": 0.4553333333333333, + "grad_norm": 5.08579158782959, + "kl": 0.25439453125, + "learning_rate": 7.723333333333333e-07, + "loss": 0.0102, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3415 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.4554666666666667, + "grad_norm": 4.4787421226501465, + "kl": 0.23779296875, + "learning_rate": 7.722666666666666e-07, + "loss": 0.0095, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3416 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.4556, + "grad_norm": 6.4084296226501465, + "kl": 0.13525390625, + "learning_rate": 7.722e-07, + "loss": 0.0054, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3417 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.625, + "epoch": 0.4557333333333333, + "grad_norm": 8.912191390991211, + "kl": 0.2548828125, + "learning_rate": 7.721333333333333e-07, + "loss": 0.0102, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3418 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.45586666666666664, + "grad_norm": 7.346100330352783, + "kl": 0.3623046875, + "learning_rate": 7.720666666666667e-07, + "loss": 0.0145, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3419 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.125, + "epoch": 0.456, + "grad_norm": 6.9774675369262695, + "kl": 0.193359375, + "learning_rate": 7.72e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3420 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.9375, + "epoch": 0.45613333333333334, + "grad_norm": 4.3728928565979, + "kl": 0.2177734375, + "learning_rate": 7.719333333333334e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3421 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.45626666666666665, + "grad_norm": 11.431668281555176, + "kl": 0.20263671875, + "learning_rate": 7.718666666666667e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3422 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.4564, + "grad_norm": 7.738337516784668, + "kl": 0.2021484375, + "learning_rate": 7.718e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3423 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5625, + "epoch": 0.45653333333333335, + "grad_norm": 8.314494132995605, + "kl": 0.228515625, + "learning_rate": 7.717333333333334e-07, + "loss": 0.0091, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3424 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.45666666666666667, + "grad_norm": 5.7602925300598145, + "kl": 0.21630859375, + "learning_rate": 7.716666666666665e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3425 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.4568, + "grad_norm": 2.043029308319092, + "kl": 0.3466796875, + "learning_rate": 7.716e-07, + "loss": 0.0139, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3426 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.0, + "epoch": 0.45693333333333336, + "grad_norm": 5.856171131134033, + "kl": 0.212890625, + "learning_rate": 7.715333333333332e-07, + "loss": 0.0085, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3427 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5625, + "epoch": 0.4570666666666667, + "grad_norm": 5.673068523406982, + "kl": 0.22265625, + "learning_rate": 7.714666666666666e-07, + "loss": 0.0089, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3428 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.4572, + "grad_norm": 0.39507830142974854, + "kl": 0.22021484375, + "learning_rate": 7.713999999999999e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3429 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.4573333333333333, + "grad_norm": 4.550883769989014, + "kl": 0.2490234375, + "learning_rate": 7.713333333333333e-07, + "loss": 0.0099, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3430 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.375, + "epoch": 0.4574666666666667, + "grad_norm": 8.065049171447754, + "kl": 0.24951171875, + "learning_rate": 7.712666666666666e-07, + "loss": 0.01, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3431 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.75, + "epoch": 0.4576, + "grad_norm": 4.6375250816345215, + "kl": 0.202880859375, + "learning_rate": 7.711999999999999e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3432 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.8125, + "epoch": 0.4577333333333333, + "grad_norm": 0.4946199357509613, + "kl": 0.24658203125, + "learning_rate": 7.711333333333333e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3433 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.75, + "epoch": 0.45786666666666664, + "grad_norm": 5.408483028411865, + "kl": 0.1708984375, + "learning_rate": 7.710666666666666e-07, + "loss": 0.0068, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3434 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0625, + "epoch": 0.458, + "grad_norm": 7.288025856018066, + "kl": 0.23291015625, + "learning_rate": 7.71e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3435 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.75, + "epoch": 0.45813333333333334, + "grad_norm": 11.218524932861328, + "kl": 0.46240234375, + "learning_rate": 7.709333333333333e-07, + "loss": 0.0184, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3436 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.45826666666666666, + "grad_norm": 5.577374458312988, + "kl": 0.15673828125, + "learning_rate": 7.708666666666667e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3437 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.25, + "epoch": 0.4584, + "grad_norm": 0.672066867351532, + "kl": 0.22802734375, + "learning_rate": 7.708e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3438 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.375, + "epoch": 0.45853333333333335, + "grad_norm": 12.118697166442871, + "kl": 0.4482421875, + "learning_rate": 7.707333333333334e-07, + "loss": 0.018, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3439 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.4375, + "epoch": 0.45866666666666667, + "grad_norm": 29.966297149658203, + "kl": 0.224609375, + "learning_rate": 7.706666666666667e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3440 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.4588, + "grad_norm": 7.4927496910095215, + "kl": 0.1748046875, + "learning_rate": 7.705999999999999e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3441 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.8125, + "epoch": 0.45893333333333336, + "grad_norm": 11.758223533630371, + "kl": 0.24658203125, + "learning_rate": 7.705333333333333e-07, + "loss": 0.0098, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3442 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.4590666666666667, + "grad_norm": 0.456427663564682, + "kl": 0.20751953125, + "learning_rate": 7.704666666666666e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3443 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5, + "epoch": 0.4592, + "grad_norm": 44.808658599853516, + "kl": 0.64208984375, + "learning_rate": 7.704e-07, + "loss": 0.0257, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3444 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.9375, + "epoch": 0.4593333333333333, + "grad_norm": 6.0161027908325195, + "kl": 0.2841796875, + "learning_rate": 7.703333333333333e-07, + "loss": 0.0114, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3445 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.4594666666666667, + "grad_norm": 5.677512168884277, + "kl": 0.169921875, + "learning_rate": 7.702666666666667e-07, + "loss": 0.0068, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3446 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.4596, + "grad_norm": 0.7152519226074219, + "kl": 0.2216796875, + "learning_rate": 7.702e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3447 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.4597333333333333, + "grad_norm": 3.183403491973877, + "kl": 0.2548828125, + "learning_rate": 7.701333333333333e-07, + "loss": 0.0102, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3448 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.875, + "epoch": 0.45986666666666665, + "grad_norm": 7.327603340148926, + "kl": 0.17138671875, + "learning_rate": 7.700666666666666e-07, + "loss": 0.0068, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3449 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1875, + "epoch": 0.46, + "grad_norm": 6.3335442543029785, + "kl": 0.27490234375, + "learning_rate": 7.699999999999999e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3450 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.46013333333333334, + "grad_norm": 7.61722469329834, + "kl": 0.1689453125, + "learning_rate": 7.699333333333333e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3451 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.46026666666666666, + "grad_norm": 6.451989650726318, + "kl": 0.4052734375, + "learning_rate": 7.698666666666666e-07, + "loss": 0.0162, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 3452 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.25, + "epoch": 0.4604, + "grad_norm": 7.139161109924316, + "kl": 0.2666015625, + "learning_rate": 7.698e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3453 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5, + "epoch": 0.46053333333333335, + "grad_norm": 5.151640892028809, + "kl": 0.21240234375, + "learning_rate": 7.697333333333333e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3454 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.46066666666666667, + "grad_norm": 12.315130233764648, + "kl": 0.16015625, + "learning_rate": 7.696666666666667e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3455 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.4608, + "grad_norm": 5.195484638214111, + "kl": 0.234375, + "learning_rate": 7.695999999999999e-07, + "loss": 0.0094, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3456 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.875, + "epoch": 0.4609333333333333, + "grad_norm": 5.884009838104248, + "kl": 0.18212890625, + "learning_rate": 7.695333333333333e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3457 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 0.4610666666666667, + "grad_norm": 8.88394832611084, + "kl": 0.22509765625, + "learning_rate": 7.694666666666666e-07, + "loss": 0.009, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3458 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9375, + "epoch": 0.4612, + "grad_norm": 6.804621696472168, + "kl": 0.158203125, + "learning_rate": 7.693999999999999e-07, + "loss": 0.0063, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3459 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.9375, + "epoch": 0.4613333333333333, + "grad_norm": 0.295283704996109, + "kl": 0.1875, + "learning_rate": 7.693333333333333e-07, + "loss": 0.0075, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3460 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.625, + "epoch": 0.4614666666666667, + "grad_norm": 8.715941429138184, + "kl": 0.234375, + "learning_rate": 7.692666666666666e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3461 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.75, + "epoch": 0.4616, + "grad_norm": 7.494444847106934, + "kl": 0.197265625, + "learning_rate": 7.692e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3462 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.46173333333333333, + "grad_norm": 6.413173198699951, + "kl": 0.2109375, + "learning_rate": 7.691333333333333e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3463 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.46186666666666665, + "grad_norm": 1.3636583089828491, + "kl": 0.1826171875, + "learning_rate": 7.690666666666667e-07, + "loss": 0.0073, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3464 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.462, + "grad_norm": 10.377541542053223, + "kl": 0.2490234375, + "learning_rate": 7.69e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 3465 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.46213333333333334, + "grad_norm": 66.25886535644531, + "kl": 0.267578125, + "learning_rate": 7.689333333333334e-07, + "loss": 0.0107, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3466 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3125, + "epoch": 0.46226666666666666, + "grad_norm": 4.145279407501221, + "kl": 0.14697265625, + "learning_rate": 7.688666666666667e-07, + "loss": 0.0059, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3467 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6875, + "epoch": 0.4624, + "grad_norm": 4.745988845825195, + "kl": 0.2626953125, + "learning_rate": 7.688000000000001e-07, + "loss": 0.0105, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3468 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.46253333333333335, + "grad_norm": 7.347900390625, + "kl": 0.2177734375, + "learning_rate": 7.687333333333333e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3469 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.375, + "epoch": 0.46266666666666667, + "grad_norm": 10.595775604248047, + "kl": 0.3505859375, + "learning_rate": 7.686666666666666e-07, + "loss": 0.014, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 3470 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 0.4628, + "grad_norm": 8.805889129638672, + "kl": 0.3359375, + "learning_rate": 7.685999999999999e-07, + "loss": 0.0134, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 3471 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.4629333333333333, + "grad_norm": 5.607431411743164, + "kl": 0.2080078125, + "learning_rate": 7.685333333333332e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3472 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5, + "epoch": 0.4630666666666667, + "grad_norm": 9.280106544494629, + "kl": 0.2265625, + "learning_rate": 7.684666666666666e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3473 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.8125, + "epoch": 0.4632, + "grad_norm": 5.368020057678223, + "kl": 0.31494140625, + "learning_rate": 7.683999999999999e-07, + "loss": 0.0126, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3474 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.4633333333333333, + "grad_norm": 8.576275825500488, + "kl": 0.193359375, + "learning_rate": 7.683333333333333e-07, + "loss": 0.0077, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3475 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 0.4634666666666667, + "grad_norm": 6.915533542633057, + "kl": 0.169921875, + "learning_rate": 7.682666666666666e-07, + "loss": 0.0068, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3476 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.1875, + "epoch": 0.4636, + "grad_norm": 7.154943466186523, + "kl": 0.1865234375, + "learning_rate": 7.682e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3477 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.75, + "epoch": 0.46373333333333333, + "grad_norm": 7.4159111976623535, + "kl": 0.21337890625, + "learning_rate": 7.681333333333333e-07, + "loss": 0.0085, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3478 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.46386666666666665, + "grad_norm": 4.171408176422119, + "kl": 0.29296875, + "learning_rate": 7.680666666666666e-07, + "loss": 0.0117, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3479 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.464, + "grad_norm": 8.962077140808105, + "kl": 0.22021484375, + "learning_rate": 7.68e-07, + "loss": 0.0088, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3480 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.46413333333333334, + "grad_norm": 4.280847549438477, + "kl": 0.1787109375, + "learning_rate": 7.679333333333333e-07, + "loss": 0.0071, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3481 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.875, + "epoch": 0.46426666666666666, + "grad_norm": 6.352588653564453, + "kl": 0.1630859375, + "learning_rate": 7.678666666666667e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3482 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.625, + "epoch": 0.4644, + "grad_norm": 7.0486602783203125, + "kl": 0.2861328125, + "learning_rate": 7.678e-07, + "loss": 0.0114, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3483 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.46453333333333335, + "grad_norm": 6.763266086578369, + "kl": 0.263671875, + "learning_rate": 7.677333333333334e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3484 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.4646666666666667, + "grad_norm": 6.452882289886475, + "kl": 0.130859375, + "learning_rate": 7.676666666666667e-07, + "loss": 0.0052, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3485 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.1875, + "epoch": 0.4648, + "grad_norm": 5.253005504608154, + "kl": 0.19775390625, + "learning_rate": 7.676e-07, + "loss": 0.0079, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3486 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.8125, + "epoch": 0.4649333333333333, + "grad_norm": 9.856045722961426, + "kl": 0.404296875, + "learning_rate": 7.675333333333333e-07, + "loss": 0.0162, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3487 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.6875, + "epoch": 0.4650666666666667, + "grad_norm": 7.39785623550415, + "kl": 0.236328125, + "learning_rate": 7.674666666666666e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3488 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 0.4652, + "grad_norm": 8.437877655029297, + "kl": 0.2197265625, + "learning_rate": 7.674e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3489 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 0.4653333333333333, + "grad_norm": 7.367069244384766, + "kl": 0.203125, + "learning_rate": 7.673333333333332e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3490 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.5625, + "epoch": 0.46546666666666664, + "grad_norm": 5.812874794006348, + "kl": 0.21240234375, + "learning_rate": 7.672666666666666e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3491 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.9375, + "epoch": 0.4656, + "grad_norm": 7.162113666534424, + "kl": 0.19384765625, + "learning_rate": 7.671999999999999e-07, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3492 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 0.46573333333333333, + "grad_norm": 8.117475509643555, + "kl": 0.19580078125, + "learning_rate": 7.671333333333333e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3493 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.5, + "epoch": 0.46586666666666665, + "grad_norm": 0.6458792090415955, + "kl": 0.22998046875, + "learning_rate": 7.670666666666666e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3494 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.3125, + "epoch": 0.466, + "grad_norm": 15.852363586425781, + "kl": 0.22900390625, + "learning_rate": 7.67e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3495 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.6875, + "epoch": 0.46613333333333334, + "grad_norm": 7.869645595550537, + "kl": 0.26123046875, + "learning_rate": 7.669333333333333e-07, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3496 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.0625, + "epoch": 0.46626666666666666, + "grad_norm": 25.17885398864746, + "kl": 0.33251953125, + "learning_rate": 7.668666666666666e-07, + "loss": 0.0133, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3497 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.125, + "epoch": 0.4664, + "grad_norm": 0.5868118405342102, + "kl": 0.21484375, + "learning_rate": 7.668e-07, + "loss": 0.0086, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3498 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.875, + "epoch": 0.46653333333333336, + "grad_norm": 17.022207260131836, + "kl": 0.6435546875, + "learning_rate": 7.667333333333333e-07, + "loss": 0.0257, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3499 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.875, + "epoch": 0.4666666666666667, + "grad_norm": 6.065625190734863, + "kl": 0.228515625, + "learning_rate": 7.666666666666667e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3500 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.3125, + "epoch": 0.4668, + "grad_norm": 12.065895080566406, + "kl": 0.236328125, + "learning_rate": 7.665999999999999e-07, + "loss": 0.0095, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3501 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.4669333333333333, + "grad_norm": 6.8056745529174805, + "kl": 0.137451171875, + "learning_rate": 7.665333333333333e-07, + "loss": 0.0055, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3502 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.4670666666666667, + "grad_norm": 3.494611978530884, + "kl": 0.2158203125, + "learning_rate": 7.664666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3503 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5, + "epoch": 0.4672, + "grad_norm": 9.812882423400879, + "kl": 0.2294921875, + "learning_rate": 7.664e-07, + "loss": 0.0092, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3504 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.6875, + "epoch": 0.4673333333333333, + "grad_norm": 7.788884162902832, + "kl": 0.2138671875, + "learning_rate": 7.663333333333333e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3505 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.9375, + "epoch": 0.46746666666666664, + "grad_norm": 36.36988067626953, + "kl": 0.3076171875, + "learning_rate": 7.662666666666666e-07, + "loss": 0.0123, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3506 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.4375, + "epoch": 0.4676, + "grad_norm": 5.148518085479736, + "kl": 0.1923828125, + "learning_rate": 7.662e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3507 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.125, + "epoch": 0.46773333333333333, + "grad_norm": 6.277476787567139, + "kl": 0.27685546875, + "learning_rate": 7.661333333333333e-07, + "loss": 0.0111, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3508 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.75, + "epoch": 0.46786666666666665, + "grad_norm": 8.433329582214355, + "kl": 0.19580078125, + "learning_rate": 7.660666666666667e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3509 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.468, + "grad_norm": 5.027157306671143, + "kl": 0.14306640625, + "learning_rate": 7.66e-07, + "loss": 0.0057, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3510 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.6875, + "epoch": 0.46813333333333335, + "grad_norm": 5.899160385131836, + "kl": 0.19873046875, + "learning_rate": 7.659333333333333e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3511 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.25, + "epoch": 0.46826666666666666, + "grad_norm": 5.537870407104492, + "kl": 0.21728515625, + "learning_rate": 7.658666666666666e-07, + "loss": 0.0087, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3512 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.6875, + "epoch": 0.4684, + "grad_norm": 3.5042355060577393, + "kl": 0.21240234375, + "learning_rate": 7.658e-07, + "loss": 0.0085, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3513 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.46853333333333336, + "grad_norm": 14.402478218078613, + "kl": 0.50439453125, + "learning_rate": 7.657333333333333e-07, + "loss": 0.0202, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3514 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 0.4686666666666667, + "grad_norm": 8.12824535369873, + "kl": 0.185546875, + "learning_rate": 7.656666666666667e-07, + "loss": 0.0074, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3515 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3125, + "epoch": 0.4688, + "grad_norm": 7.006341934204102, + "kl": 0.1943359375, + "learning_rate": 7.655999999999999e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3516 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.6875, + "epoch": 0.4689333333333333, + "grad_norm": 8.59926700592041, + "kl": 0.25048828125, + "learning_rate": 7.655333333333332e-07, + "loss": 0.01, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3517 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 0.4690666666666667, + "grad_norm": 7.152944564819336, + "kl": 0.21240234375, + "learning_rate": 7.654666666666666e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3518 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 0.4692, + "grad_norm": 5.258631706237793, + "kl": 0.2431640625, + "learning_rate": 7.653999999999999e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3519 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.75, + "epoch": 0.4693333333333333, + "grad_norm": 6.678834915161133, + "kl": 0.23193359375, + "learning_rate": 7.653333333333333e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3520 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0625, + "epoch": 0.46946666666666664, + "grad_norm": 7.632673740386963, + "kl": 0.1865234375, + "learning_rate": 7.652666666666666e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3521 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.375, + "epoch": 0.4696, + "grad_norm": 5.519421577453613, + "kl": 0.2841796875, + "learning_rate": 7.652e-07, + "loss": 0.0114, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3522 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.125, + "epoch": 0.46973333333333334, + "grad_norm": 8.348705291748047, + "kl": 0.2294921875, + "learning_rate": 7.651333333333333e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3523 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.46986666666666665, + "grad_norm": 10.105491638183594, + "kl": 0.1875, + "learning_rate": 7.650666666666667e-07, + "loss": 0.0075, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3524 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.875, + "epoch": 0.47, + "grad_norm": 6.213290214538574, + "kl": 0.30322265625, + "learning_rate": 7.65e-07, + "loss": 0.0122, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3525 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 0.47013333333333335, + "grad_norm": 4.875824451446533, + "kl": 0.181640625, + "learning_rate": 7.649333333333333e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3526 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.4375, + "epoch": 0.47026666666666667, + "grad_norm": 5.301959991455078, + "kl": 0.271484375, + "learning_rate": 7.648666666666667e-07, + "loss": 0.0109, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3527 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 0.4704, + "grad_norm": 7.776271343231201, + "kl": 0.15869140625, + "learning_rate": 7.648e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3528 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.47053333333333336, + "grad_norm": 5.5079851150512695, + "kl": 0.232421875, + "learning_rate": 7.647333333333334e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3529 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.1875, + "epoch": 0.4706666666666667, + "grad_norm": 6.541909694671631, + "kl": 0.27294921875, + "learning_rate": 7.646666666666667e-07, + "loss": 0.0109, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3530 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.4375, + "epoch": 0.4708, + "grad_norm": 65.48574829101562, + "kl": 2.990234375, + "learning_rate": 7.646e-07, + "loss": 0.1196, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3531 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.875, + "epoch": 0.4709333333333333, + "grad_norm": 3.5940473079681396, + "kl": 0.2509765625, + "learning_rate": 7.645333333333332e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3532 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.4710666666666667, + "grad_norm": 7.311242580413818, + "kl": 0.18017578125, + "learning_rate": 7.644666666666666e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3533 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.3125, + "epoch": 0.4712, + "grad_norm": 7.07169246673584, + "kl": 0.2294921875, + "learning_rate": 7.643999999999999e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3534 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.125, + "epoch": 0.4713333333333333, + "grad_norm": 7.393982887268066, + "kl": 0.1787109375, + "learning_rate": 7.643333333333332e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3535 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.3125, + "epoch": 0.47146666666666665, + "grad_norm": 6.941710472106934, + "kl": 0.1875, + "learning_rate": 7.642666666666666e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3536 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.4716, + "grad_norm": 5.101339340209961, + "kl": 0.25537109375, + "learning_rate": 7.641999999999999e-07, + "loss": 0.0102, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3537 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.375, + "epoch": 0.47173333333333334, + "grad_norm": 7.647596836090088, + "kl": 0.169921875, + "learning_rate": 7.641333333333333e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3538 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.875, + "epoch": 0.47186666666666666, + "grad_norm": 9.739762306213379, + "kl": 0.21533203125, + "learning_rate": 7.640666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3539 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.125, + "epoch": 0.472, + "grad_norm": 10.818222999572754, + "kl": 0.3994140625, + "learning_rate": 7.64e-07, + "loss": 0.016, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3540 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.625, + "epoch": 0.47213333333333335, + "grad_norm": 7.27985954284668, + "kl": 0.18408203125, + "learning_rate": 7.639333333333333e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3541 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.47226666666666667, + "grad_norm": 8.504867553710938, + "kl": 0.21728515625, + "learning_rate": 7.638666666666667e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3542 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.9375, + "epoch": 0.4724, + "grad_norm": 4.211130142211914, + "kl": 0.22412109375, + "learning_rate": 7.638e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3543 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.75, + "epoch": 0.47253333333333336, + "grad_norm": 6.628116607666016, + "kl": 0.23876953125, + "learning_rate": 7.637333333333333e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3544 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5, + "epoch": 0.4726666666666667, + "grad_norm": 5.88210916519165, + "kl": 0.1787109375, + "learning_rate": 7.636666666666667e-07, + "loss": 0.0071, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3545 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0625, + "epoch": 0.4728, + "grad_norm": 7.725909233093262, + "kl": 0.22607421875, + "learning_rate": 7.635999999999999e-07, + "loss": 0.009, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3546 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.9375, + "epoch": 0.4729333333333333, + "grad_norm": 7.544195652008057, + "kl": 0.3291015625, + "learning_rate": 7.635333333333333e-07, + "loss": 0.0132, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3547 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5, + "epoch": 0.4730666666666667, + "grad_norm": 4.922785758972168, + "kl": 0.240234375, + "learning_rate": 7.634666666666666e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3548 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5625, + "epoch": 0.4732, + "grad_norm": 1.913009524345398, + "kl": 0.3330078125, + "learning_rate": 7.634e-07, + "loss": 0.0133, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3549 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0, + "epoch": 0.47333333333333333, + "grad_norm": 4.68765926361084, + "kl": 0.29296875, + "learning_rate": 7.633333333333333e-07, + "loss": 0.0117, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3550 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.3125, + "epoch": 0.47346666666666665, + "grad_norm": 79.47908020019531, + "kl": 2.69189453125, + "learning_rate": 7.632666666666667e-07, + "loss": 0.1073, + "reward": 0.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8125, + "step": 3551 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 0.4736, + "grad_norm": 8.415688514709473, + "kl": 0.21826171875, + "learning_rate": 7.632e-07, + "loss": 0.0087, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3552 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 0.47373333333333334, + "grad_norm": 7.232600212097168, + "kl": 0.2451171875, + "learning_rate": 7.631333333333332e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3553 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.6875, + "epoch": 0.47386666666666666, + "grad_norm": 7.391580104827881, + "kl": 0.27099609375, + "learning_rate": 7.630666666666666e-07, + "loss": 0.0108, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3554 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.4375, + "epoch": 0.474, + "grad_norm": 6.489584445953369, + "kl": 0.2099609375, + "learning_rate": 7.629999999999999e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3555 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.875, + "epoch": 0.47413333333333335, + "grad_norm": 7.938352584838867, + "kl": 0.2587890625, + "learning_rate": 7.629333333333333e-07, + "loss": 0.0104, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 3556 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.9375, + "epoch": 0.47426666666666667, + "grad_norm": 8.191702842712402, + "kl": 0.2021484375, + "learning_rate": 7.628666666666666e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3557 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5625, + "epoch": 0.4744, + "grad_norm": 8.456786155700684, + "kl": 0.27197265625, + "learning_rate": 7.628e-07, + "loss": 0.0109, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3558 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.375, + "epoch": 0.4745333333333333, + "grad_norm": 19.86947250366211, + "kl": 0.2197265625, + "learning_rate": 7.627333333333333e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3559 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.375, + "epoch": 0.4746666666666667, + "grad_norm": 7.800961494445801, + "kl": 0.21826171875, + "learning_rate": 7.626666666666667e-07, + "loss": 0.0087, + "reward": 1.125, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 3560 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0625, + "epoch": 0.4748, + "grad_norm": 7.256244659423828, + "kl": 0.181640625, + "learning_rate": 7.625999999999999e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3561 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.4749333333333333, + "grad_norm": 8.092866897583008, + "kl": 0.19140625, + "learning_rate": 7.625333333333332e-07, + "loss": 0.0077, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3562 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.1875, + "epoch": 0.4750666666666667, + "grad_norm": 9.43298053741455, + "kl": 0.43701171875, + "learning_rate": 7.624666666666666e-07, + "loss": 0.0174, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3563 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.625, + "epoch": 0.4752, + "grad_norm": 8.363844871520996, + "kl": 0.1796875, + "learning_rate": 7.623999999999999e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3564 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5625, + "epoch": 0.47533333333333333, + "grad_norm": 4.976203441619873, + "kl": 0.19140625, + "learning_rate": 7.623333333333333e-07, + "loss": 0.0077, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3565 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.47546666666666665, + "grad_norm": 7.889224529266357, + "kl": 0.2275390625, + "learning_rate": 7.622666666666666e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3566 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.4756, + "grad_norm": 6.142086029052734, + "kl": 0.16650390625, + "learning_rate": 7.622e-07, + "loss": 0.0067, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3567 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.25, + "epoch": 0.47573333333333334, + "grad_norm": 11.428969383239746, + "kl": 0.25537109375, + "learning_rate": 7.621333333333333e-07, + "loss": 0.0102, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3568 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.47586666666666666, + "grad_norm": 6.059123992919922, + "kl": 0.17724609375, + "learning_rate": 7.620666666666667e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3569 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.476, + "grad_norm": 5.621702671051025, + "kl": 0.240234375, + "learning_rate": 7.62e-07, + "loss": 0.0096, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3570 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.47613333333333335, + "grad_norm": 10.005193710327148, + "kl": 0.2021484375, + "learning_rate": 7.619333333333334e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3571 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.47626666666666667, + "grad_norm": 29.58641242980957, + "kl": 2.044921875, + "learning_rate": 7.618666666666667e-07, + "loss": 0.082, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 3572 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.9375, + "epoch": 0.4764, + "grad_norm": 7.440464496612549, + "kl": 0.30810546875, + "learning_rate": 7.618e-07, + "loss": 0.0123, + "reward": 1.1875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3573 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.8125, + "epoch": 0.4765333333333333, + "grad_norm": 10.0841064453125, + "kl": 0.3798828125, + "learning_rate": 7.617333333333334e-07, + "loss": 0.0152, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3574 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.1875, + "epoch": 0.4766666666666667, + "grad_norm": 6.997726917266846, + "kl": 0.2216796875, + "learning_rate": 7.616666666666666e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3575 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 0.4768, + "grad_norm": 7.888454914093018, + "kl": 0.212890625, + "learning_rate": 7.616e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3576 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0, + "epoch": 0.4769333333333333, + "grad_norm": 5.133638381958008, + "kl": 0.28955078125, + "learning_rate": 7.615333333333332e-07, + "loss": 0.0116, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3577 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.125, + "epoch": 0.4770666666666667, + "grad_norm": 4.990627288818359, + "kl": 0.1953125, + "learning_rate": 7.614666666666666e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3578 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0, + "epoch": 0.4772, + "grad_norm": 5.133511066436768, + "kl": 0.3701171875, + "learning_rate": 7.613999999999999e-07, + "loss": 0.0148, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 3579 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.6875, + "epoch": 0.47733333333333333, + "grad_norm": 7.001753807067871, + "kl": 0.2763671875, + "learning_rate": 7.613333333333333e-07, + "loss": 0.011, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3580 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 0.47746666666666665, + "grad_norm": 9.616701126098633, + "kl": 0.1640625, + "learning_rate": 7.612666666666666e-07, + "loss": 0.0066, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3581 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.375, + "epoch": 0.4776, + "grad_norm": 8.262381553649902, + "kl": 0.1796875, + "learning_rate": 7.611999999999999e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3582 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.375, + "epoch": 0.47773333333333334, + "grad_norm": 7.74766731262207, + "kl": 0.2080078125, + "learning_rate": 7.611333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3583 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.25, + "epoch": 0.47786666666666666, + "grad_norm": 6.417717456817627, + "kl": 0.1904296875, + "learning_rate": 7.610666666666666e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3584 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 0.478, + "grad_norm": 7.023239612579346, + "kl": 0.15185546875, + "learning_rate": 7.61e-07, + "loss": 0.0061, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3585 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.47813333333333335, + "grad_norm": 8.014569282531738, + "kl": 0.20947265625, + "learning_rate": 7.609333333333333e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3586 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1875, + "epoch": 0.4782666666666667, + "grad_norm": 8.97590446472168, + "kl": 0.22900390625, + "learning_rate": 7.608666666666667e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3587 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.4784, + "grad_norm": 8.729618072509766, + "kl": 0.24072265625, + "learning_rate": 7.608e-07, + "loss": 0.0096, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3588 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.75, + "epoch": 0.4785333333333333, + "grad_norm": 9.349699974060059, + "kl": 0.23681640625, + "learning_rate": 7.607333333333334e-07, + "loss": 0.0095, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3589 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0625, + "epoch": 0.4786666666666667, + "grad_norm": 8.20787239074707, + "kl": 0.18896484375, + "learning_rate": 7.606666666666667e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3590 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.125, + "epoch": 0.4788, + "grad_norm": 9.81746768951416, + "kl": 0.296875, + "learning_rate": 7.606e-07, + "loss": 0.0119, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3591 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.25, + "epoch": 0.4789333333333333, + "grad_norm": 10.293619155883789, + "kl": 0.34765625, + "learning_rate": 7.605333333333333e-07, + "loss": 0.0139, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3592 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 0.47906666666666664, + "grad_norm": 7.851104259490967, + "kl": 0.17822265625, + "learning_rate": 7.604666666666666e-07, + "loss": 0.0071, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3593 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5625, + "epoch": 0.4792, + "grad_norm": 8.235939025878906, + "kl": 0.21728515625, + "learning_rate": 7.604e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3594 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.875, + "epoch": 0.47933333333333333, + "grad_norm": 8.532655715942383, + "kl": 0.2060546875, + "learning_rate": 7.603333333333332e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3595 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 0.47946666666666665, + "grad_norm": 9.334901809692383, + "kl": 0.2880859375, + "learning_rate": 7.602666666666666e-07, + "loss": 0.0115, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3596 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 0.4796, + "grad_norm": 9.505949020385742, + "kl": 0.205078125, + "learning_rate": 7.601999999999999e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3597 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.3125, + "epoch": 0.47973333333333334, + "grad_norm": 7.853371620178223, + "kl": 0.23095703125, + "learning_rate": 7.601333333333333e-07, + "loss": 0.0092, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3598 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.3125, + "epoch": 0.47986666666666666, + "grad_norm": 7.550726890563965, + "kl": 0.25634765625, + "learning_rate": 7.600666666666666e-07, + "loss": 0.0103, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3599 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.4375, + "epoch": 0.48, + "grad_norm": 7.56726598739624, + "kl": 0.271484375, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3600 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0625, + "epoch": 0.48013333333333336, + "grad_norm": 7.487982273101807, + "kl": 0.2685546875, + "learning_rate": 7.599333333333333e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3601 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.3125, + "epoch": 0.4802666666666667, + "grad_norm": 21.78593635559082, + "kl": 0.2890625, + "learning_rate": 7.598666666666666e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3602 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.875, + "epoch": 0.4804, + "grad_norm": 5.328373908996582, + "kl": 0.2958984375, + "learning_rate": 7.598e-07, + "loss": 0.0119, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3603 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.4805333333333333, + "grad_norm": 9.402114868164062, + "kl": 0.1884765625, + "learning_rate": 7.597333333333333e-07, + "loss": 0.0075, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3604 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0, + "epoch": 0.4806666666666667, + "grad_norm": 14.650140762329102, + "kl": 0.736328125, + "learning_rate": 7.596666666666667e-07, + "loss": 0.0294, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3605 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5625, + "epoch": 0.4808, + "grad_norm": 4.413546085357666, + "kl": 0.18896484375, + "learning_rate": 7.596e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3606 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.9375, + "epoch": 0.4809333333333333, + "grad_norm": 3.763806104660034, + "kl": 0.21728515625, + "learning_rate": 7.595333333333333e-07, + "loss": 0.0087, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3607 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 0.48106666666666664, + "grad_norm": 8.204171180725098, + "kl": 0.23486328125, + "learning_rate": 7.594666666666666e-07, + "loss": 0.0094, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3608 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5, + "epoch": 0.4812, + "grad_norm": 8.293075561523438, + "kl": 0.1904296875, + "learning_rate": 7.593999999999999e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3609 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 0.48133333333333334, + "grad_norm": 5.498129844665527, + "kl": 0.2265625, + "learning_rate": 7.593333333333333e-07, + "loss": 0.0091, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3610 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0625, + "epoch": 0.48146666666666665, + "grad_norm": 7.406184673309326, + "kl": 0.26953125, + "learning_rate": 7.592666666666666e-07, + "loss": 0.0108, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3611 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.4816, + "grad_norm": 7.79302978515625, + "kl": 0.20703125, + "learning_rate": 7.592e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3612 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.48173333333333335, + "grad_norm": 3.7575573921203613, + "kl": 0.22216796875, + "learning_rate": 7.591333333333333e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3613 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5625, + "epoch": 0.48186666666666667, + "grad_norm": 5.912720203399658, + "kl": 0.27294921875, + "learning_rate": 7.590666666666667e-07, + "loss": 0.0109, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3614 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.482, + "grad_norm": 102.61540985107422, + "kl": 0.23291015625, + "learning_rate": 7.59e-07, + "loss": 0.0093, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3615 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.8125, + "epoch": 0.48213333333333336, + "grad_norm": 4.985191345214844, + "kl": 0.3388671875, + "learning_rate": 7.589333333333334e-07, + "loss": 0.0136, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3616 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.4822666666666667, + "grad_norm": 6.173102378845215, + "kl": 0.2763671875, + "learning_rate": 7.588666666666666e-07, + "loss": 0.011, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3617 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.4375, + "epoch": 0.4824, + "grad_norm": 5.360503673553467, + "kl": 0.205078125, + "learning_rate": 7.588e-07, + "loss": 0.0082, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3618 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.875, + "epoch": 0.4825333333333333, + "grad_norm": 7.495425701141357, + "kl": 0.353515625, + "learning_rate": 7.587333333333333e-07, + "loss": 0.0141, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3619 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.4826666666666667, + "grad_norm": 4.130828857421875, + "kl": 0.24951171875, + "learning_rate": 7.586666666666666e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3620 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.25, + "epoch": 0.4828, + "grad_norm": 8.299042701721191, + "kl": 0.4111328125, + "learning_rate": 7.586e-07, + "loss": 0.0165, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3621 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.375, + "epoch": 0.4829333333333333, + "grad_norm": 9.656254768371582, + "kl": 0.3134765625, + "learning_rate": 7.585333333333332e-07, + "loss": 0.0125, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3622 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.25, + "epoch": 0.48306666666666664, + "grad_norm": 9.058772087097168, + "kl": 0.2939453125, + "learning_rate": 7.584666666666666e-07, + "loss": 0.0117, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3623 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.375, + "epoch": 0.4832, + "grad_norm": 13.385004043579102, + "kl": 0.25341796875, + "learning_rate": 7.583999999999999e-07, + "loss": 0.0101, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3624 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.48333333333333334, + "grad_norm": 4.719457149505615, + "kl": 0.25341796875, + "learning_rate": 7.583333333333333e-07, + "loss": 0.0102, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3625 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.5625, + "epoch": 0.48346666666666666, + "grad_norm": 15.09950065612793, + "kl": 0.3427734375, + "learning_rate": 7.582666666666666e-07, + "loss": 0.0137, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3626 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.9375, + "epoch": 0.4836, + "grad_norm": 0.47380414605140686, + "kl": 0.2138671875, + "learning_rate": 7.582e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3627 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.25, + "epoch": 0.48373333333333335, + "grad_norm": 12.672136306762695, + "kl": 0.55859375, + "learning_rate": 7.581333333333333e-07, + "loss": 0.0224, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 3628 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.9375, + "epoch": 0.48386666666666667, + "grad_norm": 0.8125580549240112, + "kl": 0.2734375, + "learning_rate": 7.580666666666666e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3629 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.75, + "epoch": 0.484, + "grad_norm": 9.287019729614258, + "kl": 0.349609375, + "learning_rate": 7.58e-07, + "loss": 0.014, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3630 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.0625, + "epoch": 0.48413333333333336, + "grad_norm": 6.232975959777832, + "kl": 0.2548828125, + "learning_rate": 7.579333333333333e-07, + "loss": 0.0102, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3631 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.875, + "epoch": 0.4842666666666667, + "grad_norm": 8.135380744934082, + "kl": 0.2724609375, + "learning_rate": 7.578666666666667e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3632 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.875, + "epoch": 0.4844, + "grad_norm": 9.230762481689453, + "kl": 0.330078125, + "learning_rate": 7.578e-07, + "loss": 0.0132, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3633 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.4845333333333333, + "grad_norm": 5.498730659484863, + "kl": 0.23095703125, + "learning_rate": 7.577333333333334e-07, + "loss": 0.0092, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3634 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5, + "epoch": 0.4846666666666667, + "grad_norm": 8.845742225646973, + "kl": 0.21875, + "learning_rate": 7.576666666666667e-07, + "loss": 0.0087, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3635 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.375, + "epoch": 0.4848, + "grad_norm": 7.035221099853516, + "kl": 0.3798828125, + "learning_rate": 7.576000000000001e-07, + "loss": 0.0152, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3636 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0, + "epoch": 0.4849333333333333, + "grad_norm": 8.522326469421387, + "kl": 0.27197265625, + "learning_rate": 7.575333333333332e-07, + "loss": 0.0109, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3637 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0, + "epoch": 0.48506666666666665, + "grad_norm": 9.098684310913086, + "kl": 0.236328125, + "learning_rate": 7.574666666666665e-07, + "loss": 0.0094, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3638 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.125, + "epoch": 0.4852, + "grad_norm": 10.706013679504395, + "kl": 0.359375, + "learning_rate": 7.573999999999999e-07, + "loss": 0.0144, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3639 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.1875, + "epoch": 0.48533333333333334, + "grad_norm": 7.917548656463623, + "kl": 0.33154296875, + "learning_rate": 7.573333333333332e-07, + "loss": 0.0133, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3640 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.1875, + "epoch": 0.48546666666666666, + "grad_norm": 5.215019702911377, + "kl": 0.24365234375, + "learning_rate": 7.572666666666666e-07, + "loss": 0.0098, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3641 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.4856, + "grad_norm": 5.883626937866211, + "kl": 0.291015625, + "learning_rate": 7.571999999999999e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3642 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.48573333333333335, + "grad_norm": 10.61046028137207, + "kl": 0.2880859375, + "learning_rate": 7.571333333333333e-07, + "loss": 0.0115, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3643 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.4375, + "epoch": 0.48586666666666667, + "grad_norm": 5.346075534820557, + "kl": 0.21484375, + "learning_rate": 7.570666666666666e-07, + "loss": 0.0086, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3644 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.486, + "grad_norm": 8.654717445373535, + "kl": 0.22607421875, + "learning_rate": 7.57e-07, + "loss": 0.0091, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3645 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.125, + "epoch": 0.4861333333333333, + "grad_norm": 9.029638290405273, + "kl": 0.328125, + "learning_rate": 7.569333333333333e-07, + "loss": 0.0132, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3646 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.875, + "epoch": 0.4862666666666667, + "grad_norm": 0.4541252553462982, + "kl": 0.302734375, + "learning_rate": 7.568666666666666e-07, + "loss": 0.0121, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3647 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.875, + "epoch": 0.4864, + "grad_norm": 9.469585418701172, + "kl": 0.3994140625, + "learning_rate": 7.568e-07, + "loss": 0.016, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3648 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.375, + "epoch": 0.4865333333333333, + "grad_norm": 0.44807571172714233, + "kl": 0.26171875, + "learning_rate": 7.567333333333333e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3649 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.5, + "epoch": 0.4866666666666667, + "grad_norm": 7.304057598114014, + "kl": 0.3466796875, + "learning_rate": 7.566666666666667e-07, + "loss": 0.0138, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3650 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.9375, + "epoch": 0.4868, + "grad_norm": 8.986790657043457, + "kl": 0.3955078125, + "learning_rate": 7.566e-07, + "loss": 0.0158, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3651 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.8125, + "epoch": 0.48693333333333333, + "grad_norm": 8.269843101501465, + "kl": 0.1845703125, + "learning_rate": 7.565333333333333e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3652 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.48706666666666665, + "grad_norm": 10.011322021484375, + "kl": 0.2470703125, + "learning_rate": 7.564666666666666e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3653 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.3125, + "epoch": 0.4872, + "grad_norm": 8.471098899841309, + "kl": 0.29150390625, + "learning_rate": 7.564e-07, + "loss": 0.0117, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3654 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.48733333333333334, + "grad_norm": 0.5038418769836426, + "kl": 0.23876953125, + "learning_rate": 7.563333333333333e-07, + "loss": 0.0095, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3655 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.48746666666666666, + "grad_norm": 10.439891815185547, + "kl": 0.396484375, + "learning_rate": 7.562666666666666e-07, + "loss": 0.0159, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 3656 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.8125, + "epoch": 0.4876, + "grad_norm": 6.36724328994751, + "kl": 0.3037109375, + "learning_rate": 7.562e-07, + "loss": 0.0122, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3657 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 0.48773333333333335, + "grad_norm": 7.4928107261657715, + "kl": 0.255859375, + "learning_rate": 7.561333333333332e-07, + "loss": 0.0102, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3658 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.375, + "epoch": 0.48786666666666667, + "grad_norm": 7.952850818634033, + "kl": 0.3759765625, + "learning_rate": 7.560666666666666e-07, + "loss": 0.015, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3659 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.488, + "grad_norm": 4.624041557312012, + "kl": 0.24560546875, + "learning_rate": 7.559999999999999e-07, + "loss": 0.0098, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3660 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.9375, + "epoch": 0.4881333333333333, + "grad_norm": 6.495771408081055, + "kl": 0.2861328125, + "learning_rate": 7.559333333333333e-07, + "loss": 0.0115, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3661 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.1875, + "epoch": 0.4882666666666667, + "grad_norm": 6.491541862487793, + "kl": 0.22314453125, + "learning_rate": 7.558666666666666e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3662 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.4884, + "grad_norm": 0.30022162199020386, + "kl": 0.19970703125, + "learning_rate": 7.558e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3663 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.4885333333333333, + "grad_norm": 5.350852012634277, + "kl": 0.2001953125, + "learning_rate": 7.557333333333333e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3664 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.4886666666666667, + "grad_norm": 0.5471274852752686, + "kl": 0.197265625, + "learning_rate": 7.556666666666667e-07, + "loss": 0.0079, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3665 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.3125, + "epoch": 0.4888, + "grad_norm": 7.671535968780518, + "kl": 0.17041015625, + "learning_rate": 7.556e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3666 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0625, + "epoch": 0.48893333333333333, + "grad_norm": 7.1843156814575195, + "kl": 0.20068359375, + "learning_rate": 7.555333333333332e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3667 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.125, + "epoch": 0.48906666666666665, + "grad_norm": 18.59103775024414, + "kl": 0.22314453125, + "learning_rate": 7.554666666666666e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 3668 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 0.4892, + "grad_norm": 6.971684455871582, + "kl": 0.2412109375, + "learning_rate": 7.553999999999999e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3669 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.375, + "epoch": 0.48933333333333334, + "grad_norm": 8.821162223815918, + "kl": 0.25732421875, + "learning_rate": 7.553333333333333e-07, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3670 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0625, + "epoch": 0.48946666666666666, + "grad_norm": 12.362741470336914, + "kl": 0.404296875, + "learning_rate": 7.552666666666666e-07, + "loss": 0.0162, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3671 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.0, + "epoch": 0.4896, + "grad_norm": 5.169884204864502, + "kl": 0.2705078125, + "learning_rate": 7.552e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3672 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.48973333333333335, + "grad_norm": 5.406614303588867, + "kl": 0.25390625, + "learning_rate": 7.551333333333333e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3673 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.0625, + "epoch": 0.4898666666666667, + "grad_norm": 5.722196102142334, + "kl": 0.22900390625, + "learning_rate": 7.550666666666667e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3674 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5, + "epoch": 0.49, + "grad_norm": 13.47675895690918, + "kl": 0.18798828125, + "learning_rate": 7.55e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3675 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.4901333333333333, + "grad_norm": 8.464110374450684, + "kl": 0.27490234375, + "learning_rate": 7.549333333333333e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3676 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.625, + "epoch": 0.4902666666666667, + "grad_norm": 7.216304302215576, + "kl": 0.1796875, + "learning_rate": 7.548666666666667e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3677 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.4904, + "grad_norm": 5.561407566070557, + "kl": 0.2978515625, + "learning_rate": 7.548e-07, + "loss": 0.0119, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3678 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.875, + "epoch": 0.4905333333333333, + "grad_norm": 8.349078178405762, + "kl": 0.25927734375, + "learning_rate": 7.547333333333334e-07, + "loss": 0.0104, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3679 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.49066666666666664, + "grad_norm": 7.1943511962890625, + "kl": 0.189453125, + "learning_rate": 7.546666666666666e-07, + "loss": 0.0076, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3680 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.4908, + "grad_norm": 6.4473557472229, + "kl": 0.24609375, + "learning_rate": 7.546e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3681 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5625, + "epoch": 0.49093333333333333, + "grad_norm": 9.197769165039062, + "kl": 0.22900390625, + "learning_rate": 7.545333333333332e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3682 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.49106666666666665, + "grad_norm": 6.868714332580566, + "kl": 0.24951171875, + "learning_rate": 7.544666666666666e-07, + "loss": 0.01, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3683 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 0.4912, + "grad_norm": 7.2760772705078125, + "kl": 0.28759765625, + "learning_rate": 7.543999999999999e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3684 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.125, + "epoch": 0.49133333333333334, + "grad_norm": 5.9551801681518555, + "kl": 0.2724609375, + "learning_rate": 7.543333333333332e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3685 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5, + "epoch": 0.49146666666666666, + "grad_norm": 5.565146446228027, + "kl": 0.24560546875, + "learning_rate": 7.542666666666666e-07, + "loss": 0.0098, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3686 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5, + "epoch": 0.4916, + "grad_norm": 10.175074577331543, + "kl": 0.21728515625, + "learning_rate": 7.541999999999999e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3687 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.49173333333333336, + "grad_norm": 9.454981803894043, + "kl": 0.19482421875, + "learning_rate": 7.541333333333333e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3688 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.6875, + "epoch": 0.4918666666666667, + "grad_norm": 9.943422317504883, + "kl": 0.3203125, + "learning_rate": 7.540666666666666e-07, + "loss": 0.0128, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3689 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0, + "epoch": 0.492, + "grad_norm": 4.743348598480225, + "kl": 0.167236328125, + "learning_rate": 7.54e-07, + "loss": 0.0067, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3690 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5, + "epoch": 0.4921333333333333, + "grad_norm": 0.48767736554145813, + "kl": 0.275390625, + "learning_rate": 7.539333333333333e-07, + "loss": 0.011, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3691 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0, + "epoch": 0.4922666666666667, + "grad_norm": 0.3664047122001648, + "kl": 0.22998046875, + "learning_rate": 7.538666666666667e-07, + "loss": 0.0092, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3692 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.4924, + "grad_norm": 5.465327262878418, + "kl": 0.271484375, + "learning_rate": 7.538e-07, + "loss": 0.0109, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3693 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 0.4925333333333333, + "grad_norm": 3.456498146057129, + "kl": 0.1953125, + "learning_rate": 7.537333333333333e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3694 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.9375, + "epoch": 0.49266666666666664, + "grad_norm": 7.031943321228027, + "kl": 0.275390625, + "learning_rate": 7.536666666666667e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3695 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.8125, + "epoch": 0.4928, + "grad_norm": 8.024020195007324, + "kl": 0.287109375, + "learning_rate": 7.536e-07, + "loss": 0.0115, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3696 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 0.49293333333333333, + "grad_norm": 8.121230125427246, + "kl": 0.16259765625, + "learning_rate": 7.535333333333334e-07, + "loss": 0.0065, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3697 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 0.49306666666666665, + "grad_norm": 4.419188499450684, + "kl": 0.1806640625, + "learning_rate": 7.534666666666666e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3698 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.4932, + "grad_norm": 7.186795711517334, + "kl": 0.32275390625, + "learning_rate": 7.534e-07, + "loss": 0.0129, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3699 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 0.49333333333333335, + "grad_norm": 7.812525749206543, + "kl": 0.1806640625, + "learning_rate": 7.533333333333332e-07, + "loss": 0.0072, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3700 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.75, + "epoch": 0.49346666666666666, + "grad_norm": 8.366377830505371, + "kl": 0.2958984375, + "learning_rate": 7.532666666666666e-07, + "loss": 0.0119, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3701 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8125, + "epoch": 0.4936, + "grad_norm": 4.258826732635498, + "kl": 0.19482421875, + "learning_rate": 7.531999999999999e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3702 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.75, + "epoch": 0.49373333333333336, + "grad_norm": 5.192541599273682, + "kl": 0.16015625, + "learning_rate": 7.531333333333332e-07, + "loss": 0.0064, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3703 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.4938666666666667, + "grad_norm": 5.031310081481934, + "kl": 0.25, + "learning_rate": 7.530666666666666e-07, + "loss": 0.01, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3704 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.625, + "epoch": 0.494, + "grad_norm": 7.585578918457031, + "kl": 0.353515625, + "learning_rate": 7.529999999999999e-07, + "loss": 0.0141, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3705 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.625, + "epoch": 0.4941333333333333, + "grad_norm": 4.706550121307373, + "kl": 0.1669921875, + "learning_rate": 7.529333333333333e-07, + "loss": 0.0067, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3706 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.875, + "epoch": 0.4942666666666667, + "grad_norm": 6.517086505889893, + "kl": 0.2216796875, + "learning_rate": 7.528666666666666e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3707 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 0.4944, + "grad_norm": 8.94314956665039, + "kl": 0.2294921875, + "learning_rate": 7.528e-07, + "loss": 0.0092, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 3708 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.1875, + "epoch": 0.4945333333333333, + "grad_norm": 11.208703994750977, + "kl": 0.29248046875, + "learning_rate": 7.527333333333333e-07, + "loss": 0.0117, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 3709 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.3125, + "epoch": 0.49466666666666664, + "grad_norm": 0.3337799906730652, + "kl": 0.26953125, + "learning_rate": 7.526666666666667e-07, + "loss": 0.0108, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3710 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.75, + "epoch": 0.4948, + "grad_norm": 6.6566314697265625, + "kl": 0.21875, + "learning_rate": 7.526e-07, + "loss": 0.0088, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3711 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.75, + "epoch": 0.49493333333333334, + "grad_norm": 0.6028213500976562, + "kl": 0.2373046875, + "learning_rate": 7.525333333333334e-07, + "loss": 0.0095, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3712 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.5625, + "epoch": 0.49506666666666665, + "grad_norm": 6.873280048370361, + "kl": 0.271484375, + "learning_rate": 7.524666666666666e-07, + "loss": 0.0109, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3713 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.875, + "epoch": 0.4952, + "grad_norm": 0.4590761959552765, + "kl": 0.21826171875, + "learning_rate": 7.523999999999999e-07, + "loss": 0.0087, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3714 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.8125, + "epoch": 0.49533333333333335, + "grad_norm": 13.465612411499023, + "kl": 0.28466796875, + "learning_rate": 7.523333333333333e-07, + "loss": 0.0114, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3715 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.0, + "epoch": 0.49546666666666667, + "grad_norm": 0.5673381090164185, + "kl": 0.322265625, + "learning_rate": 7.522666666666666e-07, + "loss": 0.0129, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3716 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0625, + "epoch": 0.4956, + "grad_norm": 6.709904670715332, + "kl": 0.25634765625, + "learning_rate": 7.522e-07, + "loss": 0.0103, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3717 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5625, + "epoch": 0.49573333333333336, + "grad_norm": 10.292941093444824, + "kl": 0.25634765625, + "learning_rate": 7.521333333333333e-07, + "loss": 0.0103, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3718 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.1875, + "epoch": 0.4958666666666667, + "grad_norm": 11.404926300048828, + "kl": 0.3955078125, + "learning_rate": 7.520666666666667e-07, + "loss": 0.0158, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3719 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.496, + "grad_norm": 8.443865776062012, + "kl": 0.16796875, + "learning_rate": 7.52e-07, + "loss": 0.0067, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3720 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.875, + "epoch": 0.4961333333333333, + "grad_norm": 7.209661483764648, + "kl": 0.240234375, + "learning_rate": 7.519333333333334e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3721 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.4962666666666667, + "grad_norm": 7.452376365661621, + "kl": 0.2119140625, + "learning_rate": 7.518666666666666e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3722 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.75, + "epoch": 0.4964, + "grad_norm": 11.599651336669922, + "kl": 0.3193359375, + "learning_rate": 7.517999999999999e-07, + "loss": 0.0128, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3723 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5625, + "epoch": 0.4965333333333333, + "grad_norm": 13.536941528320312, + "kl": 0.2490234375, + "learning_rate": 7.517333333333333e-07, + "loss": 0.01, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3724 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 0.49666666666666665, + "grad_norm": 12.575098037719727, + "kl": 0.21875, + "learning_rate": 7.516666666666666e-07, + "loss": 0.0088, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3725 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.75, + "epoch": 0.4968, + "grad_norm": 8.908465385437012, + "kl": 0.3974609375, + "learning_rate": 7.516e-07, + "loss": 0.0159, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3726 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5625, + "epoch": 0.49693333333333334, + "grad_norm": 6.43015193939209, + "kl": 0.2080078125, + "learning_rate": 7.515333333333333e-07, + "loss": 0.0083, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3727 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.375, + "epoch": 0.49706666666666666, + "grad_norm": 7.422156810760498, + "kl": 0.3046875, + "learning_rate": 7.514666666666666e-07, + "loss": 0.0122, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3728 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.9375, + "epoch": 0.4972, + "grad_norm": 10.847439765930176, + "kl": 0.27685546875, + "learning_rate": 7.513999999999999e-07, + "loss": 0.0111, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3729 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5625, + "epoch": 0.49733333333333335, + "grad_norm": 86.8962173461914, + "kl": 0.4931640625, + "learning_rate": 7.513333333333333e-07, + "loss": 0.0198, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3730 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.9375, + "epoch": 0.49746666666666667, + "grad_norm": 6.143075942993164, + "kl": 0.4658203125, + "learning_rate": 7.512666666666666e-07, + "loss": 0.0186, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3731 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.5, + "epoch": 0.4976, + "grad_norm": 7.6775288581848145, + "kl": 0.25341796875, + "learning_rate": 7.511999999999999e-07, + "loss": 0.0101, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3732 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.8125, + "epoch": 0.49773333333333336, + "grad_norm": 11.954809188842773, + "kl": 0.2724609375, + "learning_rate": 7.511333333333333e-07, + "loss": 0.0109, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3733 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.4375, + "epoch": 0.4978666666666667, + "grad_norm": 16.978317260742188, + "kl": 0.462890625, + "learning_rate": 7.510666666666666e-07, + "loss": 0.0185, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3734 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.4375, + "epoch": 0.498, + "grad_norm": 17.06437873840332, + "kl": 0.4130859375, + "learning_rate": 7.51e-07, + "loss": 0.0165, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3735 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.25, + "epoch": 0.4981333333333333, + "grad_norm": 7.193946361541748, + "kl": 0.41796875, + "learning_rate": 7.509333333333333e-07, + "loss": 0.0167, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3736 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.8125, + "epoch": 0.4982666666666667, + "grad_norm": 6.3679728507995605, + "kl": 0.359375, + "learning_rate": 7.508666666666667e-07, + "loss": 0.0144, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3737 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.6875, + "epoch": 0.4984, + "grad_norm": 10.827320098876953, + "kl": 0.564453125, + "learning_rate": 7.508e-07, + "loss": 0.0226, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3738 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0, + "epoch": 0.49853333333333333, + "grad_norm": 12.348599433898926, + "kl": 0.4306640625, + "learning_rate": 7.507333333333334e-07, + "loss": 0.0172, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3739 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.375, + "epoch": 0.49866666666666665, + "grad_norm": 8.425392150878906, + "kl": 0.30517578125, + "learning_rate": 7.506666666666667e-07, + "loss": 0.0122, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3740 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.25, + "epoch": 0.4988, + "grad_norm": 13.48202133178711, + "kl": 0.42919921875, + "learning_rate": 7.506e-07, + "loss": 0.0172, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3741 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.75, + "epoch": 0.49893333333333334, + "grad_norm": 9.517391204833984, + "kl": 0.38671875, + "learning_rate": 7.505333333333334e-07, + "loss": 0.0155, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3742 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.75, + "epoch": 0.49906666666666666, + "grad_norm": 9.681672096252441, + "kl": 0.47265625, + "learning_rate": 7.504666666666665e-07, + "loss": 0.0189, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3743 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5, + "epoch": 0.4992, + "grad_norm": 4.221128463745117, + "kl": 0.20263671875, + "learning_rate": 7.503999999999999e-07, + "loss": 0.0081, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3744 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.1875, + "epoch": 0.49933333333333335, + "grad_norm": 0.6174949407577515, + "kl": 0.38671875, + "learning_rate": 7.503333333333332e-07, + "loss": 0.0155, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3745 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.0, + "epoch": 0.49946666666666667, + "grad_norm": 10.550004005432129, + "kl": 0.564453125, + "learning_rate": 7.502666666666666e-07, + "loss": 0.0226, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3746 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.5, + "epoch": 0.4996, + "grad_norm": 7.89240837097168, + "kl": 0.5205078125, + "learning_rate": 7.501999999999999e-07, + "loss": 0.0209, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3747 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.25, + "epoch": 0.4997333333333333, + "grad_norm": 11.424846649169922, + "kl": 0.4833984375, + "learning_rate": 7.501333333333333e-07, + "loss": 0.0194, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3748 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.1875, + "epoch": 0.4998666666666667, + "grad_norm": 8.336617469787598, + "kl": 0.25244140625, + "learning_rate": 7.500666666666666e-07, + "loss": 0.0101, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3749 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.5, + "grad_norm": 9.081232070922852, + "kl": 0.357421875, + "learning_rate": 7.5e-07, + "loss": 0.0143, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3750 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 0.5001333333333333, + "grad_norm": 6.523933410644531, + "kl": 0.3134765625, + "learning_rate": 7.499333333333333e-07, + "loss": 0.0125, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3751 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5625, + "epoch": 0.5002666666666666, + "grad_norm": 5.1809515953063965, + "kl": 0.31494140625, + "learning_rate": 7.498666666666666e-07, + "loss": 0.0126, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3752 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5, + "epoch": 0.5004, + "grad_norm": 6.568466663360596, + "kl": 0.4853515625, + "learning_rate": 7.498e-07, + "loss": 0.0194, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3753 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.9375, + "epoch": 0.5005333333333334, + "grad_norm": 11.198577880859375, + "kl": 0.3955078125, + "learning_rate": 7.497333333333333e-07, + "loss": 0.0158, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3754 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.0625, + "epoch": 0.5006666666666667, + "grad_norm": 6.911953926086426, + "kl": 0.4208984375, + "learning_rate": 7.496666666666667e-07, + "loss": 0.0168, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3755 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.9375, + "epoch": 0.5008, + "grad_norm": 9.873940467834473, + "kl": 0.36962890625, + "learning_rate": 7.496e-07, + "loss": 0.0148, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3756 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.125, + "epoch": 0.5009333333333333, + "grad_norm": 7.464991569519043, + "kl": 0.2919921875, + "learning_rate": 7.495333333333334e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3757 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.6875, + "epoch": 0.5010666666666667, + "grad_norm": 7.204183578491211, + "kl": 0.43994140625, + "learning_rate": 7.494666666666666e-07, + "loss": 0.0176, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3758 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.5012, + "grad_norm": 5.636521339416504, + "kl": 0.3349609375, + "learning_rate": 7.494e-07, + "loss": 0.0134, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3759 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.4375, + "epoch": 0.5013333333333333, + "grad_norm": 57.07334518432617, + "kl": 3.8046875, + "learning_rate": 7.493333333333333e-07, + "loss": 0.152, + "reward": 1.625, + "reward_std": 0.816463440656662, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.8125, + "step": 3760 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.5014666666666666, + "grad_norm": 10.43874740600586, + "kl": 0.330078125, + "learning_rate": 7.492666666666666e-07, + "loss": 0.0132, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3761 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.25, + "epoch": 0.5016, + "grad_norm": 0.4852445721626282, + "kl": 0.4794921875, + "learning_rate": 7.492e-07, + "loss": 0.0192, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3762 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 0.5017333333333334, + "grad_norm": 5.412662029266357, + "kl": 0.34375, + "learning_rate": 7.491333333333333e-07, + "loss": 0.0137, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3763 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 0.5018666666666667, + "grad_norm": 6.768702030181885, + "kl": 0.361328125, + "learning_rate": 7.490666666666667e-07, + "loss": 0.0145, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3764 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0, + "epoch": 0.502, + "grad_norm": 9.251935005187988, + "kl": 0.55078125, + "learning_rate": 7.489999999999999e-07, + "loss": 0.022, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3765 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5625, + "epoch": 0.5021333333333333, + "grad_norm": 18.16510581970215, + "kl": 0.580078125, + "learning_rate": 7.489333333333333e-07, + "loss": 0.0232, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3766 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0625, + "epoch": 0.5022666666666666, + "grad_norm": 17.36095428466797, + "kl": 0.4091796875, + "learning_rate": 7.488666666666666e-07, + "loss": 0.0163, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3767 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 0.5024, + "grad_norm": 10.395658493041992, + "kl": 0.630859375, + "learning_rate": 7.488e-07, + "loss": 0.0252, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3768 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.75, + "epoch": 0.5025333333333334, + "grad_norm": 3.7506630420684814, + "kl": 0.4833984375, + "learning_rate": 7.487333333333333e-07, + "loss": 0.0194, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3769 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.9375, + "epoch": 0.5026666666666667, + "grad_norm": 9.467133522033691, + "kl": 0.6572265625, + "learning_rate": 7.486666666666666e-07, + "loss": 0.0263, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3770 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.375, + "epoch": 0.5028, + "grad_norm": 165.52101135253906, + "kl": 0.5322265625, + "learning_rate": 7.486e-07, + "loss": 0.0213, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3771 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0, + "epoch": 0.5029333333333333, + "grad_norm": 0.5565448999404907, + "kl": 0.31640625, + "learning_rate": 7.485333333333333e-07, + "loss": 0.0126, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3772 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.0625, + "epoch": 0.5030666666666667, + "grad_norm": 10.889375686645508, + "kl": 0.40087890625, + "learning_rate": 7.484666666666666e-07, + "loss": 0.016, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3773 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.75, + "epoch": 0.5032, + "grad_norm": 10.343093872070312, + "kl": 0.40234375, + "learning_rate": 7.483999999999999e-07, + "loss": 0.0161, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3774 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6875, + "epoch": 0.5033333333333333, + "grad_norm": 0.34085533022880554, + "kl": 0.30517578125, + "learning_rate": 7.483333333333333e-07, + "loss": 0.0122, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3775 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.1875, + "epoch": 0.5034666666666666, + "grad_norm": 16.365528106689453, + "kl": 0.791015625, + "learning_rate": 7.482666666666666e-07, + "loss": 0.0317, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3776 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 0.5036, + "grad_norm": 4.695663928985596, + "kl": 0.2646484375, + "learning_rate": 7.482e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3777 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.75, + "epoch": 0.5037333333333334, + "grad_norm": 0.7181025147438049, + "kl": 0.37646484375, + "learning_rate": 7.481333333333333e-07, + "loss": 0.0151, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 3778 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0625, + "epoch": 0.5038666666666667, + "grad_norm": 0.4102362394332886, + "kl": 0.37744140625, + "learning_rate": 7.480666666666666e-07, + "loss": 0.0151, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3779 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.6875, + "epoch": 0.504, + "grad_norm": 5.673660755157471, + "kl": 0.4833984375, + "learning_rate": 7.48e-07, + "loss": 0.0193, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3780 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.375, + "epoch": 0.5041333333333333, + "grad_norm": 9.333434104919434, + "kl": 0.638671875, + "learning_rate": 7.479333333333333e-07, + "loss": 0.0256, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3781 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.5, + "epoch": 0.5042666666666666, + "grad_norm": 15.122415542602539, + "kl": 0.572265625, + "learning_rate": 7.478666666666667e-07, + "loss": 0.0228, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3782 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.8125, + "epoch": 0.5044, + "grad_norm": 8.267053604125977, + "kl": 0.25732421875, + "learning_rate": 7.478e-07, + "loss": 0.0103, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3783 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 0.5045333333333333, + "grad_norm": 6.362896919250488, + "kl": 0.17529296875, + "learning_rate": 7.477333333333334e-07, + "loss": 0.007, + "reward": 1.1875, + "reward_std": 0.6199793070554733, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3784 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.9375, + "epoch": 0.5046666666666667, + "grad_norm": 8.364811897277832, + "kl": 0.3388671875, + "learning_rate": 7.476666666666667e-07, + "loss": 0.0136, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3785 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5, + "epoch": 0.5048, + "grad_norm": 6.726675987243652, + "kl": 0.271484375, + "learning_rate": 7.476e-07, + "loss": 0.0109, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3786 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.5049333333333333, + "grad_norm": 1.603332281112671, + "kl": 0.203125, + "learning_rate": 7.475333333333333e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3787 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.75, + "epoch": 0.5050666666666667, + "grad_norm": 9.7391939163208, + "kl": 0.322265625, + "learning_rate": 7.474666666666665e-07, + "loss": 0.0129, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.875, + "step": 3788 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.625, + "epoch": 0.5052, + "grad_norm": 10.740483283996582, + "kl": 0.453125, + "learning_rate": 7.473999999999999e-07, + "loss": 0.0182, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3789 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5, + "epoch": 0.5053333333333333, + "grad_norm": 7.815906047821045, + "kl": 0.18896484375, + "learning_rate": 7.473333333333332e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3790 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.5054666666666666, + "grad_norm": 4.078420162200928, + "kl": 0.15185546875, + "learning_rate": 7.472666666666666e-07, + "loss": 0.0061, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3791 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3125, + "epoch": 0.5056, + "grad_norm": 8.015056610107422, + "kl": 0.224609375, + "learning_rate": 7.471999999999999e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3792 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.8125, + "epoch": 0.5057333333333334, + "grad_norm": 5.4120073318481445, + "kl": 0.20703125, + "learning_rate": 7.471333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3793 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.5, + "epoch": 0.5058666666666667, + "grad_norm": 11.512836456298828, + "kl": 0.5205078125, + "learning_rate": 7.470666666666666e-07, + "loss": 0.0208, + "reward": 1.5625, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 3794 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.25, + "epoch": 0.506, + "grad_norm": 13.280411720275879, + "kl": 0.3271484375, + "learning_rate": 7.47e-07, + "loss": 0.0131, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3795 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 0.5061333333333333, + "grad_norm": 7.782837390899658, + "kl": 0.529296875, + "learning_rate": 7.469333333333333e-07, + "loss": 0.0212, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3796 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.25, + "epoch": 0.5062666666666666, + "grad_norm": 9.476655006408691, + "kl": 0.2578125, + "learning_rate": 7.468666666666667e-07, + "loss": 0.0103, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3797 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.5064, + "grad_norm": 5.26491641998291, + "kl": 0.25927734375, + "learning_rate": 7.468e-07, + "loss": 0.0104, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3798 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4375, + "epoch": 0.5065333333333333, + "grad_norm": 9.306981086730957, + "kl": 0.376953125, + "learning_rate": 7.467333333333333e-07, + "loss": 0.0151, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3799 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.5066666666666667, + "grad_norm": 9.580947875976562, + "kl": 0.353515625, + "learning_rate": 7.466666666666667e-07, + "loss": 0.0141, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 3800 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.5068, + "grad_norm": 10.838539123535156, + "kl": 0.3759765625, + "learning_rate": 7.466e-07, + "loss": 0.015, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3801 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.5069333333333333, + "grad_norm": 15.673752784729004, + "kl": 0.4306640625, + "learning_rate": 7.465333333333334e-07, + "loss": 0.0172, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3802 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.25, + "epoch": 0.5070666666666667, + "grad_norm": 6.69983434677124, + "kl": 0.212890625, + "learning_rate": 7.464666666666666e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3803 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.5625, + "epoch": 0.5072, + "grad_norm": 1.290596842765808, + "kl": 0.32470703125, + "learning_rate": 7.464e-07, + "loss": 0.013, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3804 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.125, + "epoch": 0.5073333333333333, + "grad_norm": 10.3748197555542, + "kl": 0.3173828125, + "learning_rate": 7.463333333333333e-07, + "loss": 0.0127, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3805 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 0.5074666666666666, + "grad_norm": 6.885356426239014, + "kl": 0.2958984375, + "learning_rate": 7.462666666666667e-07, + "loss": 0.0118, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3806 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.5076, + "grad_norm": 11.467658996582031, + "kl": 0.21240234375, + "learning_rate": 7.461999999999999e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3807 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.5077333333333334, + "grad_norm": 5.706396579742432, + "kl": 0.2861328125, + "learning_rate": 7.461333333333332e-07, + "loss": 0.0114, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3808 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.875, + "epoch": 0.5078666666666667, + "grad_norm": 8.52243709564209, + "kl": 0.275390625, + "learning_rate": 7.460666666666666e-07, + "loss": 0.011, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3809 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.9375, + "epoch": 0.508, + "grad_norm": 11.94613265991211, + "kl": 0.30078125, + "learning_rate": 7.459999999999999e-07, + "loss": 0.012, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3810 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.5081333333333333, + "grad_norm": 0.5261600613594055, + "kl": 0.31396484375, + "learning_rate": 7.459333333333333e-07, + "loss": 0.0125, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3811 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 0.5082666666666666, + "grad_norm": 0.531721830368042, + "kl": 0.1845703125, + "learning_rate": 7.458666666666666e-07, + "loss": 0.0074, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3812 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6875, + "epoch": 0.5084, + "grad_norm": 6.77358865737915, + "kl": 0.17529296875, + "learning_rate": 7.458e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3813 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.625, + "epoch": 0.5085333333333333, + "grad_norm": 9.57474422454834, + "kl": 0.5107421875, + "learning_rate": 7.457333333333333e-07, + "loss": 0.0205, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3814 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.3125, + "epoch": 0.5086666666666667, + "grad_norm": 9.74692153930664, + "kl": 0.3056640625, + "learning_rate": 7.456666666666667e-07, + "loss": 0.0122, + "reward": 1.125, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 3815 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0, + "epoch": 0.5088, + "grad_norm": 8.906259536743164, + "kl": 0.4775390625, + "learning_rate": 7.456e-07, + "loss": 0.0191, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3816 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0, + "epoch": 0.5089333333333333, + "grad_norm": 7.736762046813965, + "kl": 0.18115234375, + "learning_rate": 7.455333333333333e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3817 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 0.5090666666666667, + "grad_norm": 0.4084847569465637, + "kl": 0.24853515625, + "learning_rate": 7.454666666666667e-07, + "loss": 0.0099, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3818 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.5092, + "grad_norm": 11.647789001464844, + "kl": 0.296875, + "learning_rate": 7.453999999999999e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3819 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.4375, + "epoch": 0.5093333333333333, + "grad_norm": 7.831934452056885, + "kl": 0.2626953125, + "learning_rate": 7.453333333333333e-07, + "loss": 0.0105, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3820 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 0.5094666666666666, + "grad_norm": 8.619487762451172, + "kl": 0.2978515625, + "learning_rate": 7.452666666666666e-07, + "loss": 0.0119, + "reward": 1.0625, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.8125, + "step": 3821 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.75, + "epoch": 0.5096, + "grad_norm": 25.268455505371094, + "kl": 0.9296875, + "learning_rate": 7.452e-07, + "loss": 0.0371, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3822 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.5097333333333334, + "grad_norm": 5.166355609893799, + "kl": 0.29345703125, + "learning_rate": 7.451333333333333e-07, + "loss": 0.0117, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3823 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.375, + "epoch": 0.5098666666666667, + "grad_norm": 8.180623054504395, + "kl": 0.15771484375, + "learning_rate": 7.450666666666667e-07, + "loss": 0.0063, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3824 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6875, + "epoch": 0.51, + "grad_norm": 13.686110496520996, + "kl": 0.3310546875, + "learning_rate": 7.45e-07, + "loss": 0.0133, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3825 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.875, + "epoch": 0.5101333333333333, + "grad_norm": 5.282297134399414, + "kl": 0.21435546875, + "learning_rate": 7.449333333333333e-07, + "loss": 0.0086, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3826 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.625, + "epoch": 0.5102666666666666, + "grad_norm": 6.087067604064941, + "kl": 0.18505859375, + "learning_rate": 7.448666666666667e-07, + "loss": 0.0074, + "reward": 1.25, + "reward_std": 0.6746576428413391, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3827 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 0.5104, + "grad_norm": 5.623158931732178, + "kl": 0.1982421875, + "learning_rate": 7.447999999999999e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3828 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.625, + "epoch": 0.5105333333333333, + "grad_norm": 9.969017028808594, + "kl": 0.22314453125, + "learning_rate": 7.447333333333333e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3829 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.3125, + "epoch": 0.5106666666666667, + "grad_norm": 1.1304469108581543, + "kl": 0.234375, + "learning_rate": 7.446666666666666e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3830 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4375, + "epoch": 0.5108, + "grad_norm": 7.213038921356201, + "kl": 0.20458984375, + "learning_rate": 7.446e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3831 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.3125, + "epoch": 0.5109333333333334, + "grad_norm": 8.971359252929688, + "kl": 0.2568359375, + "learning_rate": 7.445333333333333e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3832 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.625, + "epoch": 0.5110666666666667, + "grad_norm": 6.529086112976074, + "kl": 0.466796875, + "learning_rate": 7.444666666666667e-07, + "loss": 0.0186, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 3833 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.8125, + "epoch": 0.5112, + "grad_norm": 6.061822891235352, + "kl": 0.20751953125, + "learning_rate": 7.443999999999999e-07, + "loss": 0.0083, + "reward": 0.9375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 3834 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.5113333333333333, + "grad_norm": 6.128164768218994, + "kl": 0.4208984375, + "learning_rate": 7.443333333333332e-07, + "loss": 0.0168, + "reward": 1.5625, + "reward_std": 0.4172614812850952, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 3835 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.6875, + "epoch": 0.5114666666666666, + "grad_norm": 8.083436965942383, + "kl": 0.31103515625, + "learning_rate": 7.442666666666666e-07, + "loss": 0.0124, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3836 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.5116, + "grad_norm": 9.938801765441895, + "kl": 0.27978515625, + "learning_rate": 7.441999999999999e-07, + "loss": 0.0112, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3837 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.5117333333333334, + "grad_norm": 11.54384994506836, + "kl": 0.2890625, + "learning_rate": 7.441333333333333e-07, + "loss": 0.0116, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3838 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 0.5118666666666667, + "grad_norm": 10.172381401062012, + "kl": 0.3173828125, + "learning_rate": 7.440666666666666e-07, + "loss": 0.0127, + "reward": 1.375, + "reward_std": 0.7168372869491577, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3839 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.512, + "grad_norm": 5.726608753204346, + "kl": 0.2109375, + "learning_rate": 7.44e-07, + "loss": 0.0084, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3840 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.9375, + "epoch": 0.5121333333333333, + "grad_norm": 4.934794902801514, + "kl": 0.2724609375, + "learning_rate": 7.439333333333333e-07, + "loss": 0.0109, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 3841 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.5122666666666666, + "grad_norm": 8.750523567199707, + "kl": 0.28662109375, + "learning_rate": 7.438666666666667e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3842 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1875, + "epoch": 0.5124, + "grad_norm": 7.356362819671631, + "kl": 0.2509765625, + "learning_rate": 7.438e-07, + "loss": 0.0101, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3843 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.4375, + "epoch": 0.5125333333333333, + "grad_norm": 11.5214262008667, + "kl": 0.2646484375, + "learning_rate": 7.437333333333334e-07, + "loss": 0.0106, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 3844 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0625, + "epoch": 0.5126666666666667, + "grad_norm": 4.516272068023682, + "kl": 0.203125, + "learning_rate": 7.436666666666667e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3845 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4375, + "epoch": 0.5128, + "grad_norm": 7.179525375366211, + "kl": 0.35693359375, + "learning_rate": 7.436e-07, + "loss": 0.0142, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.875, + "step": 3846 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5, + "epoch": 0.5129333333333334, + "grad_norm": 8.17778491973877, + "kl": 0.28662109375, + "learning_rate": 7.435333333333334e-07, + "loss": 0.0114, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3847 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8125, + "epoch": 0.5130666666666667, + "grad_norm": 6.426929950714111, + "kl": 0.25341796875, + "learning_rate": 7.434666666666667e-07, + "loss": 0.0101, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3848 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.8125, + "epoch": 0.5132, + "grad_norm": 5.991567134857178, + "kl": 0.26171875, + "learning_rate": 7.433999999999999e-07, + "loss": 0.0105, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3849 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.125, + "epoch": 0.5133333333333333, + "grad_norm": 6.125587463378906, + "kl": 0.15966796875, + "learning_rate": 7.433333333333332e-07, + "loss": 0.0064, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3850 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5, + "epoch": 0.5134666666666666, + "grad_norm": 0.40319183468818665, + "kl": 0.20166015625, + "learning_rate": 7.432666666666666e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3851 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.125, + "epoch": 0.5136, + "grad_norm": 14.535001754760742, + "kl": 0.75634765625, + "learning_rate": 7.431999999999999e-07, + "loss": 0.0304, + "reward": 1.375, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3852 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0, + "epoch": 0.5137333333333334, + "grad_norm": 7.726195812225342, + "kl": 0.2421875, + "learning_rate": 7.431333333333333e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3853 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5625, + "epoch": 0.5138666666666667, + "grad_norm": 9.370970726013184, + "kl": 0.3564453125, + "learning_rate": 7.430666666666666e-07, + "loss": 0.0143, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 3854 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9375, + "epoch": 0.514, + "grad_norm": 4.030291557312012, + "kl": 0.15966796875, + "learning_rate": 7.429999999999999e-07, + "loss": 0.0064, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3855 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.75, + "epoch": 0.5141333333333333, + "grad_norm": 5.054083824157715, + "kl": 0.236328125, + "learning_rate": 7.429333333333333e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3856 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5625, + "epoch": 0.5142666666666666, + "grad_norm": 6.663233280181885, + "kl": 0.2900390625, + "learning_rate": 7.428666666666666e-07, + "loss": 0.0116, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3857 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.125, + "epoch": 0.5144, + "grad_norm": 4.5978498458862305, + "kl": 0.1572265625, + "learning_rate": 7.428e-07, + "loss": 0.0063, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3858 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0625, + "epoch": 0.5145333333333333, + "grad_norm": 6.547774791717529, + "kl": 0.20849609375, + "learning_rate": 7.427333333333333e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3859 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.625, + "epoch": 0.5146666666666667, + "grad_norm": 5.6197590827941895, + "kl": 0.27490234375, + "learning_rate": 7.426666666666667e-07, + "loss": 0.011, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3860 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.5148, + "grad_norm": 9.058164596557617, + "kl": 0.1953125, + "learning_rate": 7.426e-07, + "loss": 0.0078, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3861 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.125, + "epoch": 0.5149333333333334, + "grad_norm": 3.843283176422119, + "kl": 0.20458984375, + "learning_rate": 7.425333333333334e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3862 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0, + "epoch": 0.5150666666666667, + "grad_norm": 4.734989166259766, + "kl": 0.16357421875, + "learning_rate": 7.424666666666667e-07, + "loss": 0.0065, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3863 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8125, + "epoch": 0.5152, + "grad_norm": 3.079455614089966, + "kl": 0.20458984375, + "learning_rate": 7.423999999999999e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3864 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.5153333333333333, + "grad_norm": 5.484098434448242, + "kl": 0.22216796875, + "learning_rate": 7.423333333333333e-07, + "loss": 0.0089, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3865 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 0.5154666666666666, + "grad_norm": 5.635770797729492, + "kl": 0.23779296875, + "learning_rate": 7.422666666666666e-07, + "loss": 0.0095, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3866 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.5156, + "grad_norm": 5.33425760269165, + "kl": 0.3056640625, + "learning_rate": 7.422e-07, + "loss": 0.0122, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3867 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9375, + "epoch": 0.5157333333333334, + "grad_norm": 8.65149211883545, + "kl": 0.17822265625, + "learning_rate": 7.421333333333333e-07, + "loss": 0.0071, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3868 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0625, + "epoch": 0.5158666666666667, + "grad_norm": 13.628355026245117, + "kl": 0.2275390625, + "learning_rate": 7.420666666666667e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3869 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.6875, + "epoch": 0.516, + "grad_norm": 6.119382381439209, + "kl": 0.2314453125, + "learning_rate": 7.42e-07, + "loss": 0.0092, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3870 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25, + "epoch": 0.5161333333333333, + "grad_norm": 0.4052983224391937, + "kl": 0.220703125, + "learning_rate": 7.419333333333333e-07, + "loss": 0.0088, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3871 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.3125, + "epoch": 0.5162666666666667, + "grad_norm": 0.5481399297714233, + "kl": 0.1865234375, + "learning_rate": 7.418666666666666e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3872 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.5164, + "grad_norm": 5.248465538024902, + "kl": 0.21337890625, + "learning_rate": 7.417999999999999e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3873 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0625, + "epoch": 0.5165333333333333, + "grad_norm": 7.4687724113464355, + "kl": 0.47119140625, + "learning_rate": 7.417333333333333e-07, + "loss": 0.0188, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3874 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.9375, + "epoch": 0.5166666666666667, + "grad_norm": 7.2083740234375, + "kl": 0.2470703125, + "learning_rate": 7.416666666666666e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3875 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1875, + "epoch": 0.5168, + "grad_norm": 110.26434326171875, + "kl": 0.193359375, + "learning_rate": 7.416e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3876 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.5169333333333334, + "grad_norm": 6.212274551391602, + "kl": 0.2783203125, + "learning_rate": 7.415333333333333e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3877 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75, + "epoch": 0.5170666666666667, + "grad_norm": 8.12020206451416, + "kl": 0.23876953125, + "learning_rate": 7.414666666666667e-07, + "loss": 0.0096, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3878 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.5172, + "grad_norm": 8.246763229370117, + "kl": 0.21826171875, + "learning_rate": 7.413999999999999e-07, + "loss": 0.0087, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3879 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.5173333333333333, + "grad_norm": 8.350295066833496, + "kl": 0.19287109375, + "learning_rate": 7.413333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3880 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.375, + "epoch": 0.5174666666666666, + "grad_norm": 3.8527138233184814, + "kl": 0.21826171875, + "learning_rate": 7.412666666666666e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3881 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 0.5176, + "grad_norm": 4.849356651306152, + "kl": 0.2177734375, + "learning_rate": 7.411999999999999e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3882 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5, + "epoch": 0.5177333333333334, + "grad_norm": 0.35340937972068787, + "kl": 0.1669921875, + "learning_rate": 7.411333333333333e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3883 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.4375, + "epoch": 0.5178666666666667, + "grad_norm": 0.8525378704071045, + "kl": 0.27587890625, + "learning_rate": 7.410666666666666e-07, + "loss": 0.011, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3884 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0625, + "epoch": 0.518, + "grad_norm": 7.655086040496826, + "kl": 0.27001953125, + "learning_rate": 7.41e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3885 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.5181333333333333, + "grad_norm": 13.51273250579834, + "kl": 0.3779296875, + "learning_rate": 7.409333333333333e-07, + "loss": 0.0151, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 3886 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5, + "epoch": 0.5182666666666667, + "grad_norm": 7.26680850982666, + "kl": 0.19921875, + "learning_rate": 7.408666666666667e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3887 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.875, + "epoch": 0.5184, + "grad_norm": 8.257951736450195, + "kl": 0.36962890625, + "learning_rate": 7.408e-07, + "loss": 0.0148, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3888 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 0.5185333333333333, + "grad_norm": 5.364131450653076, + "kl": 0.1669921875, + "learning_rate": 7.407333333333334e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3889 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.75, + "epoch": 0.5186666666666667, + "grad_norm": 7.019574165344238, + "kl": 0.2041015625, + "learning_rate": 7.406666666666667e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3890 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.5188, + "grad_norm": 7.713319301605225, + "kl": 0.2392578125, + "learning_rate": 7.406000000000001e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3891 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.4375, + "epoch": 0.5189333333333334, + "grad_norm": 5.9241180419921875, + "kl": 0.13818359375, + "learning_rate": 7.405333333333333e-07, + "loss": 0.0055, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3892 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.6875, + "epoch": 0.5190666666666667, + "grad_norm": 4.951693058013916, + "kl": 0.197265625, + "learning_rate": 7.404666666666666e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3893 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.125, + "epoch": 0.5192, + "grad_norm": 5.5705156326293945, + "kl": 0.2861328125, + "learning_rate": 7.403999999999999e-07, + "loss": 0.0115, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 3894 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.5193333333333333, + "grad_norm": 6.898040294647217, + "kl": 0.19140625, + "learning_rate": 7.403333333333332e-07, + "loss": 0.0076, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3895 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5, + "epoch": 0.5194666666666666, + "grad_norm": 0.6751611828804016, + "kl": 0.2021484375, + "learning_rate": 7.402666666666666e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3896 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.5196, + "grad_norm": 8.479080200195312, + "kl": 0.2841796875, + "learning_rate": 7.401999999999999e-07, + "loss": 0.0114, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3897 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.5197333333333334, + "grad_norm": 4.752042770385742, + "kl": 0.1494140625, + "learning_rate": 7.401333333333333e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3898 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6875, + "epoch": 0.5198666666666667, + "grad_norm": 4.4535417556762695, + "kl": 0.17041015625, + "learning_rate": 7.400666666666666e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.45806270837783813, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 3899 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.125, + "epoch": 0.52, + "grad_norm": 22.480554580688477, + "kl": 0.171875, + "learning_rate": 7.4e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3900 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 0.5201333333333333, + "grad_norm": 5.851747989654541, + "kl": 0.125732421875, + "learning_rate": 7.399333333333333e-07, + "loss": 0.005, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3901 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.3125, + "epoch": 0.5202666666666667, + "grad_norm": 6.066470623016357, + "kl": 0.166015625, + "learning_rate": 7.398666666666666e-07, + "loss": 0.0066, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3902 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.625, + "epoch": 0.5204, + "grad_norm": 9.170371055603027, + "kl": 0.2783203125, + "learning_rate": 7.398e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3903 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 0.5205333333333333, + "grad_norm": 9.00657844543457, + "kl": 0.5, + "learning_rate": 7.397333333333333e-07, + "loss": 0.0199, + "reward": 1.5625, + "reward_std": 0.7216846346855164, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 3904 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.5206666666666667, + "grad_norm": 10.059743881225586, + "kl": 0.169921875, + "learning_rate": 7.396666666666667e-07, + "loss": 0.0068, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3905 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6875, + "epoch": 0.5208, + "grad_norm": 7.198017120361328, + "kl": 0.197265625, + "learning_rate": 7.396e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3906 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 0.5209333333333334, + "grad_norm": 6.100743293762207, + "kl": 0.22119140625, + "learning_rate": 7.395333333333334e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3907 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0625, + "epoch": 0.5210666666666667, + "grad_norm": 15.60824966430664, + "kl": 0.26953125, + "learning_rate": 7.394666666666667e-07, + "loss": 0.0108, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3908 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.9375, + "epoch": 0.5212, + "grad_norm": 7.788787841796875, + "kl": 0.22802734375, + "learning_rate": 7.394e-07, + "loss": 0.0091, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3909 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.9375, + "epoch": 0.5213333333333333, + "grad_norm": 8.881121635437012, + "kl": 0.412109375, + "learning_rate": 7.393333333333333e-07, + "loss": 0.0165, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 3910 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.5214666666666666, + "grad_norm": 5.264603137969971, + "kl": 0.1943359375, + "learning_rate": 7.392666666666665e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3911 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.125, + "epoch": 0.5216, + "grad_norm": 5.9249587059021, + "kl": 0.17822265625, + "learning_rate": 7.392e-07, + "loss": 0.0071, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3912 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.5217333333333334, + "grad_norm": 9.100008964538574, + "kl": 0.1982421875, + "learning_rate": 7.391333333333332e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 3913 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.8125, + "epoch": 0.5218666666666667, + "grad_norm": 5.09943151473999, + "kl": 0.25732421875, + "learning_rate": 7.390666666666666e-07, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 3914 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 0.522, + "grad_norm": 7.728481292724609, + "kl": 0.267578125, + "learning_rate": 7.389999999999999e-07, + "loss": 0.0107, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3915 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.5221333333333333, + "grad_norm": 11.74528694152832, + "kl": 0.193359375, + "learning_rate": 7.389333333333333e-07, + "loss": 0.0078, + "reward": 1.3125, + "reward_std": 0.8349219560623169, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.75, + "step": 3916 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 0.5222666666666667, + "grad_norm": 5.419708251953125, + "kl": 0.1640625, + "learning_rate": 7.388666666666666e-07, + "loss": 0.0066, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3917 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.6875, + "epoch": 0.5224, + "grad_norm": 8.589099884033203, + "kl": 0.20654296875, + "learning_rate": 7.388e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3918 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.5225333333333333, + "grad_norm": 5.0146613121032715, + "kl": 0.25634765625, + "learning_rate": 7.387333333333333e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3919 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.375, + "epoch": 0.5226666666666666, + "grad_norm": 10.752886772155762, + "kl": 0.17333984375, + "learning_rate": 7.386666666666666e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3920 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1875, + "epoch": 0.5228, + "grad_norm": 7.629311561584473, + "kl": 0.17431640625, + "learning_rate": 7.386e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3921 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.5229333333333334, + "grad_norm": 4.568310737609863, + "kl": 0.23046875, + "learning_rate": 7.385333333333333e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3922 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.5230666666666667, + "grad_norm": 8.720492362976074, + "kl": 0.2783203125, + "learning_rate": 7.384666666666667e-07, + "loss": 0.0111, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 3923 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.5232, + "grad_norm": 6.200765132904053, + "kl": 0.275390625, + "learning_rate": 7.383999999999999e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3924 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.125, + "epoch": 0.5233333333333333, + "grad_norm": 0.36541467905044556, + "kl": 0.24755859375, + "learning_rate": 7.383333333333333e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3925 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.25, + "epoch": 0.5234666666666666, + "grad_norm": 46.72539520263672, + "kl": 0.41357421875, + "learning_rate": 7.382666666666666e-07, + "loss": 0.0166, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3926 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5, + "epoch": 0.5236, + "grad_norm": 8.792903900146484, + "kl": 0.18896484375, + "learning_rate": 7.382e-07, + "loss": 0.0075, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 3927 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.8125, + "epoch": 0.5237333333333334, + "grad_norm": 182.65512084960938, + "kl": 0.16064453125, + "learning_rate": 7.381333333333333e-07, + "loss": 0.0064, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3928 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.375, + "epoch": 0.5238666666666667, + "grad_norm": 10.401251792907715, + "kl": 0.21826171875, + "learning_rate": 7.380666666666666e-07, + "loss": 0.0087, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3929 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.3125, + "epoch": 0.524, + "grad_norm": 7.928893089294434, + "kl": 0.20703125, + "learning_rate": 7.38e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3930 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.625, + "epoch": 0.5241333333333333, + "grad_norm": 5.9452433586120605, + "kl": 0.1572265625, + "learning_rate": 7.379333333333333e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3931 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.3125, + "epoch": 0.5242666666666667, + "grad_norm": 8.47170352935791, + "kl": 0.2099609375, + "learning_rate": 7.378666666666667e-07, + "loss": 0.0084, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3932 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.6875, + "epoch": 0.5244, + "grad_norm": 6.020195960998535, + "kl": 0.29296875, + "learning_rate": 7.378e-07, + "loss": 0.0117, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 3933 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.125, + "epoch": 0.5245333333333333, + "grad_norm": 4.503012657165527, + "kl": 0.1748046875, + "learning_rate": 7.377333333333333e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3934 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9375, + "epoch": 0.5246666666666666, + "grad_norm": 19.68462562561035, + "kl": 0.1591796875, + "learning_rate": 7.376666666666666e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3935 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.125, + "epoch": 0.5248, + "grad_norm": 5.759716510772705, + "kl": 0.1787109375, + "learning_rate": 7.376e-07, + "loss": 0.0072, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3936 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.4375, + "epoch": 0.5249333333333334, + "grad_norm": 5.052263259887695, + "kl": 0.150390625, + "learning_rate": 7.375333333333333e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3937 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0, + "epoch": 0.5250666666666667, + "grad_norm": 4.79433536529541, + "kl": 0.158203125, + "learning_rate": 7.374666666666667e-07, + "loss": 0.0063, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3938 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.375, + "epoch": 0.5252, + "grad_norm": 9.691505432128906, + "kl": 0.32080078125, + "learning_rate": 7.374e-07, + "loss": 0.0128, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 3939 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0625, + "epoch": 0.5253333333333333, + "grad_norm": 7.648075103759766, + "kl": 0.2099609375, + "learning_rate": 7.373333333333332e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3940 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.5254666666666666, + "grad_norm": 7.000082015991211, + "kl": 0.1962890625, + "learning_rate": 7.372666666666666e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3941 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.5256, + "grad_norm": 1.9417142868041992, + "kl": 0.18896484375, + "learning_rate": 7.371999999999999e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3942 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.8125, + "epoch": 0.5257333333333334, + "grad_norm": 3.795294761657715, + "kl": 0.13134765625, + "learning_rate": 7.371333333333333e-07, + "loss": 0.0052, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3943 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5, + "epoch": 0.5258666666666667, + "grad_norm": 6.324313640594482, + "kl": 0.2939453125, + "learning_rate": 7.370666666666666e-07, + "loss": 0.0118, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3944 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.4375, + "epoch": 0.526, + "grad_norm": 7.702014446258545, + "kl": 0.2548828125, + "learning_rate": 7.37e-07, + "loss": 0.0102, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 3945 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9375, + "epoch": 0.5261333333333333, + "grad_norm": 6.740260601043701, + "kl": 0.19970703125, + "learning_rate": 7.369333333333333e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3946 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.75, + "epoch": 0.5262666666666667, + "grad_norm": 5.443835258483887, + "kl": 0.21826171875, + "learning_rate": 7.368666666666667e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3947 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8125, + "epoch": 0.5264, + "grad_norm": 3.7337357997894287, + "kl": 0.1748046875, + "learning_rate": 7.368e-07, + "loss": 0.007, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3948 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.5265333333333333, + "grad_norm": 7.127203464508057, + "kl": 0.177734375, + "learning_rate": 7.367333333333333e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3949 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.3125, + "epoch": 0.5266666666666666, + "grad_norm": 5.6135406494140625, + "kl": 0.22021484375, + "learning_rate": 7.366666666666667e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3950 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5625, + "epoch": 0.5268, + "grad_norm": 7.7969746589660645, + "kl": 0.1796875, + "learning_rate": 7.366e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3951 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5625, + "epoch": 0.5269333333333334, + "grad_norm": 7.089471817016602, + "kl": 0.1708984375, + "learning_rate": 7.365333333333334e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 3952 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5625, + "epoch": 0.5270666666666667, + "grad_norm": 6.729261875152588, + "kl": 0.20947265625, + "learning_rate": 7.364666666666667e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3953 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0, + "epoch": 0.5272, + "grad_norm": 4.52102518081665, + "kl": 0.1904296875, + "learning_rate": 7.364000000000001e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3954 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.3125, + "epoch": 0.5273333333333333, + "grad_norm": 6.260622501373291, + "kl": 0.181640625, + "learning_rate": 7.363333333333332e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3955 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.5274666666666666, + "grad_norm": 7.123110294342041, + "kl": 0.16015625, + "learning_rate": 7.362666666666666e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3956 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 0.5276, + "grad_norm": 0.5770318508148193, + "kl": 0.23046875, + "learning_rate": 7.361999999999999e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3957 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1875, + "epoch": 0.5277333333333334, + "grad_norm": 11.397472381591797, + "kl": 0.345703125, + "learning_rate": 7.361333333333332e-07, + "loss": 0.0139, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 3958 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.875, + "epoch": 0.5278666666666667, + "grad_norm": 6.877109050750732, + "kl": 0.23828125, + "learning_rate": 7.360666666666666e-07, + "loss": 0.0096, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 3959 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.528, + "grad_norm": 8.12778377532959, + "kl": 0.208984375, + "learning_rate": 7.359999999999999e-07, + "loss": 0.0084, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3960 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.0625, + "epoch": 0.5281333333333333, + "grad_norm": 0.45285800099372864, + "kl": 0.25, + "learning_rate": 7.359333333333333e-07, + "loss": 0.01, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3961 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.125, + "epoch": 0.5282666666666667, + "grad_norm": 4.891201019287109, + "kl": 0.23046875, + "learning_rate": 7.358666666666666e-07, + "loss": 0.0092, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3962 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.4375, + "epoch": 0.5284, + "grad_norm": 6.5890212059021, + "kl": 0.1865234375, + "learning_rate": 7.358e-07, + "loss": 0.0075, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3963 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.8125, + "epoch": 0.5285333333333333, + "grad_norm": 6.830870628356934, + "kl": 0.17626953125, + "learning_rate": 7.357333333333333e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 3964 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.625, + "epoch": 0.5286666666666666, + "grad_norm": 5.396468162536621, + "kl": 0.169921875, + "learning_rate": 7.356666666666667e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3965 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0625, + "epoch": 0.5288, + "grad_norm": 7.625228404998779, + "kl": 0.23193359375, + "learning_rate": 7.356e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 3966 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.875, + "epoch": 0.5289333333333334, + "grad_norm": 4.981252670288086, + "kl": 0.23828125, + "learning_rate": 7.355333333333333e-07, + "loss": 0.0095, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3967 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.625, + "epoch": 0.5290666666666667, + "grad_norm": 4.966890811920166, + "kl": 0.2421875, + "learning_rate": 7.354666666666667e-07, + "loss": 0.0097, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3968 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.1875, + "epoch": 0.5292, + "grad_norm": 8.00937557220459, + "kl": 0.19384765625, + "learning_rate": 7.354e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3969 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.5293333333333333, + "grad_norm": 8.345441818237305, + "kl": 0.2099609375, + "learning_rate": 7.353333333333333e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3970 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.6875, + "epoch": 0.5294666666666666, + "grad_norm": 10.516895294189453, + "kl": 0.4130859375, + "learning_rate": 7.352666666666666e-07, + "loss": 0.0165, + "reward": 0.9375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 3971 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6875, + "epoch": 0.5296, + "grad_norm": 8.049015998840332, + "kl": 0.15869140625, + "learning_rate": 7.352e-07, + "loss": 0.0063, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 3972 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8125, + "epoch": 0.5297333333333333, + "grad_norm": 4.004299163818359, + "kl": 0.18310546875, + "learning_rate": 7.351333333333333e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 3973 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.875, + "epoch": 0.5298666666666667, + "grad_norm": 6.869998931884766, + "kl": 0.1640625, + "learning_rate": 7.350666666666667e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 3974 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.53, + "grad_norm": 4.185546398162842, + "kl": 0.26611328125, + "learning_rate": 7.35e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3975 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.875, + "epoch": 0.5301333333333333, + "grad_norm": 4.629006385803223, + "kl": 0.17333984375, + "learning_rate": 7.349333333333332e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 3976 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.5302666666666667, + "grad_norm": 9.606649398803711, + "kl": 0.310546875, + "learning_rate": 7.348666666666666e-07, + "loss": 0.0124, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 3977 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.375, + "epoch": 0.5304, + "grad_norm": 4.868567943572998, + "kl": 0.24365234375, + "learning_rate": 7.347999999999999e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 3978 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9375, + "epoch": 0.5305333333333333, + "grad_norm": 4.608410358428955, + "kl": 0.18115234375, + "learning_rate": 7.347333333333333e-07, + "loss": 0.0072, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3979 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4375, + "epoch": 0.5306666666666666, + "grad_norm": 0.4595368802547455, + "kl": 0.1767578125, + "learning_rate": 7.346666666666666e-07, + "loss": 0.0071, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3980 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 0.5308, + "grad_norm": 0.4239494204521179, + "kl": 0.2119140625, + "learning_rate": 7.346e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3981 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.9375, + "epoch": 0.5309333333333334, + "grad_norm": 0.37228280305862427, + "kl": 0.2548828125, + "learning_rate": 7.345333333333333e-07, + "loss": 0.0102, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3982 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.625, + "epoch": 0.5310666666666667, + "grad_norm": 3.654360771179199, + "kl": 0.263671875, + "learning_rate": 7.344666666666667e-07, + "loss": 0.0105, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 3983 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.5312, + "grad_norm": 6.394987106323242, + "kl": 0.23779296875, + "learning_rate": 7.344e-07, + "loss": 0.0095, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 3984 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5625, + "epoch": 0.5313333333333333, + "grad_norm": 5.814209938049316, + "kl": 0.24267578125, + "learning_rate": 7.343333333333332e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3985 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.5314666666666666, + "grad_norm": 7.369831562042236, + "kl": 0.16357421875, + "learning_rate": 7.342666666666666e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 3986 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.125, + "epoch": 0.5316, + "grad_norm": 6.467486381530762, + "kl": 0.212890625, + "learning_rate": 7.341999999999999e-07, + "loss": 0.0085, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 3987 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.5317333333333333, + "grad_norm": 4.12448263168335, + "kl": 0.16064453125, + "learning_rate": 7.341333333333333e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 3988 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.5318666666666667, + "grad_norm": 2.881490468978882, + "kl": 0.201904296875, + "learning_rate": 7.340666666666666e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 3989 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.532, + "grad_norm": 6.837691783905029, + "kl": 0.33203125, + "learning_rate": 7.34e-07, + "loss": 0.0133, + "reward": 1.0625, + "reward_std": 0.4172614812850952, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 3990 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.875, + "epoch": 0.5321333333333333, + "grad_norm": 0.33693966269493103, + "kl": 0.17431640625, + "learning_rate": 7.339333333333333e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3991 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.25, + "epoch": 0.5322666666666667, + "grad_norm": 6.5386738777160645, + "kl": 0.20703125, + "learning_rate": 7.338666666666667e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 3992 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.625, + "epoch": 0.5324, + "grad_norm": 0.6498154401779175, + "kl": 0.3017578125, + "learning_rate": 7.338e-07, + "loss": 0.0121, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3993 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 0.5325333333333333, + "grad_norm": 0.3638383746147156, + "kl": 0.23291015625, + "learning_rate": 7.337333333333334e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 3994 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5625, + "epoch": 0.5326666666666666, + "grad_norm": 7.26329231262207, + "kl": 0.24755859375, + "learning_rate": 7.336666666666667e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 3995 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 0.5328, + "grad_norm": 8.318741798400879, + "kl": 0.15478515625, + "learning_rate": 7.336e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3996 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.3125, + "epoch": 0.5329333333333334, + "grad_norm": 10.170365333557129, + "kl": 0.345703125, + "learning_rate": 7.335333333333334e-07, + "loss": 0.0138, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 3997 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 0.5330666666666667, + "grad_norm": 11.744296073913574, + "kl": 0.16357421875, + "learning_rate": 7.334666666666666e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 3998 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.875, + "epoch": 0.5332, + "grad_norm": 4.082614421844482, + "kl": 0.16064453125, + "learning_rate": 7.334e-07, + "loss": 0.0064, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 3999 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.125, + "epoch": 0.5333333333333333, + "grad_norm": 4.7538628578186035, + "kl": 0.1728515625, + "learning_rate": 7.333333333333332e-07, + "loss": 0.0069, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4000 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 0.5334666666666666, + "grad_norm": 4.317367076873779, + "kl": 0.205078125, + "learning_rate": 7.332666666666666e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 4001 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.5336, + "grad_norm": 4.469919204711914, + "kl": 0.3017578125, + "learning_rate": 7.331999999999999e-07, + "loss": 0.012, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4002 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 0.5337333333333333, + "grad_norm": 0.453155517578125, + "kl": 0.171875, + "learning_rate": 7.331333333333333e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4003 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5, + "epoch": 0.5338666666666667, + "grad_norm": 7.047466278076172, + "kl": 0.1845703125, + "learning_rate": 7.330666666666666e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4004 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.6875, + "epoch": 0.534, + "grad_norm": 9.531940460205078, + "kl": 0.3134765625, + "learning_rate": 7.329999999999999e-07, + "loss": 0.0125, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4005 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.6875, + "epoch": 0.5341333333333333, + "grad_norm": 81.90882110595703, + "kl": 0.3544921875, + "learning_rate": 7.329333333333333e-07, + "loss": 0.0142, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 4006 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.9375, + "epoch": 0.5342666666666667, + "grad_norm": 6.768089771270752, + "kl": 0.33203125, + "learning_rate": 7.328666666666666e-07, + "loss": 0.0133, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4007 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.5344, + "grad_norm": 6.459588050842285, + "kl": 0.2001953125, + "learning_rate": 7.328e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4008 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.5345333333333333, + "grad_norm": 4.059600830078125, + "kl": 0.2197265625, + "learning_rate": 7.327333333333333e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4009 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0625, + "epoch": 0.5346666666666666, + "grad_norm": 3.658942222595215, + "kl": 0.1904296875, + "learning_rate": 7.326666666666667e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4010 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.5348, + "grad_norm": 6.183532238006592, + "kl": 0.17236328125, + "learning_rate": 7.326e-07, + "loss": 0.0069, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4011 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0, + "epoch": 0.5349333333333334, + "grad_norm": 6.037689208984375, + "kl": 0.1650390625, + "learning_rate": 7.325333333333334e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4012 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.8125, + "epoch": 0.5350666666666667, + "grad_norm": 5.848928451538086, + "kl": 0.14794921875, + "learning_rate": 7.324666666666667e-07, + "loss": 0.0059, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4013 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.875, + "epoch": 0.5352, + "grad_norm": 0.9771897196769714, + "kl": 0.2802734375, + "learning_rate": 7.324e-07, + "loss": 0.0112, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4014 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 0.5353333333333333, + "grad_norm": 6.1925435066223145, + "kl": 0.145751953125, + "learning_rate": 7.323333333333333e-07, + "loss": 0.0058, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4015 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0625, + "epoch": 0.5354666666666666, + "grad_norm": 5.112771511077881, + "kl": 0.19384765625, + "learning_rate": 7.322666666666666e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4016 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9375, + "epoch": 0.5356, + "grad_norm": 0.44343245029449463, + "kl": 0.21142578125, + "learning_rate": 7.322e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4017 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.5357333333333333, + "grad_norm": 7.653354644775391, + "kl": 0.23193359375, + "learning_rate": 7.321333333333332e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4018 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5625, + "epoch": 0.5358666666666667, + "grad_norm": 3.675055980682373, + "kl": 0.1787109375, + "learning_rate": 7.320666666666666e-07, + "loss": 0.0071, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4019 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.625, + "epoch": 0.536, + "grad_norm": 15.73071575164795, + "kl": 0.248046875, + "learning_rate": 7.319999999999999e-07, + "loss": 0.0099, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4020 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5, + "epoch": 0.5361333333333334, + "grad_norm": 6.297033786773682, + "kl": 0.181640625, + "learning_rate": 7.319333333333333e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4021 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 0.5362666666666667, + "grad_norm": 7.3781585693359375, + "kl": 0.135498046875, + "learning_rate": 7.318666666666666e-07, + "loss": 0.0054, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4022 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.375, + "epoch": 0.5364, + "grad_norm": 12.261430740356445, + "kl": 0.68115234375, + "learning_rate": 7.317999999999999e-07, + "loss": 0.0273, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4023 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8125, + "epoch": 0.5365333333333333, + "grad_norm": 7.962174892425537, + "kl": 0.146484375, + "learning_rate": 7.317333333333333e-07, + "loss": 0.0059, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4024 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 0.5366666666666666, + "grad_norm": 5.939727783203125, + "kl": 0.16650390625, + "learning_rate": 7.316666666666666e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4025 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0, + "epoch": 0.5368, + "grad_norm": 4.197211265563965, + "kl": 0.18994140625, + "learning_rate": 7.316e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4026 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.5369333333333334, + "grad_norm": 4.5205912590026855, + "kl": 0.14208984375, + "learning_rate": 7.315333333333333e-07, + "loss": 0.0057, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4027 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1875, + "epoch": 0.5370666666666667, + "grad_norm": 59.51021194458008, + "kl": 0.23876953125, + "learning_rate": 7.314666666666667e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4028 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.5372, + "grad_norm": 8.750336647033691, + "kl": 0.236328125, + "learning_rate": 7.314e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4029 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0, + "epoch": 0.5373333333333333, + "grad_norm": 9.499273300170898, + "kl": 0.26123046875, + "learning_rate": 7.313333333333333e-07, + "loss": 0.0105, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4030 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.5374666666666666, + "grad_norm": 4.443542003631592, + "kl": 0.17041015625, + "learning_rate": 7.312666666666666e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4031 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.9375, + "epoch": 0.5376, + "grad_norm": 9.701437950134277, + "kl": 0.384765625, + "learning_rate": 7.311999999999999e-07, + "loss": 0.0154, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 4032 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.375, + "epoch": 0.5377333333333333, + "grad_norm": 5.662271976470947, + "kl": 0.122314453125, + "learning_rate": 7.311333333333333e-07, + "loss": 0.0049, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4033 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.5378666666666667, + "grad_norm": 9.730521202087402, + "kl": 0.17626953125, + "learning_rate": 7.310666666666666e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4034 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.3125, + "epoch": 0.538, + "grad_norm": 5.947269916534424, + "kl": 0.294921875, + "learning_rate": 7.31e-07, + "loss": 0.0118, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4035 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.5381333333333334, + "grad_norm": 8.676042556762695, + "kl": 0.3095703125, + "learning_rate": 7.309333333333333e-07, + "loss": 0.0124, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4036 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.3125, + "epoch": 0.5382666666666667, + "grad_norm": 7.645419597625732, + "kl": 0.23876953125, + "learning_rate": 7.308666666666667e-07, + "loss": 0.0096, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 4037 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.875, + "epoch": 0.5384, + "grad_norm": 38.331851959228516, + "kl": 0.16748046875, + "learning_rate": 7.308e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4038 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.875, + "epoch": 0.5385333333333333, + "grad_norm": 5.330282211303711, + "kl": 0.19775390625, + "learning_rate": 7.307333333333334e-07, + "loss": 0.0079, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4039 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.5386666666666666, + "grad_norm": 5.4678053855896, + "kl": 0.2451171875, + "learning_rate": 7.306666666666666e-07, + "loss": 0.0098, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4040 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.4375, + "epoch": 0.5388, + "grad_norm": 7.092830657958984, + "kl": 0.1796875, + "learning_rate": 7.306e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4041 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.875, + "epoch": 0.5389333333333334, + "grad_norm": 5.560853481292725, + "kl": 0.27490234375, + "learning_rate": 7.305333333333333e-07, + "loss": 0.011, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 4042 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.5390666666666667, + "grad_norm": 7.184783935546875, + "kl": 0.21240234375, + "learning_rate": 7.304666666666666e-07, + "loss": 0.0085, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4043 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.9375, + "epoch": 0.5392, + "grad_norm": 8.09708023071289, + "kl": 0.2177734375, + "learning_rate": 7.304e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4044 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.5393333333333333, + "grad_norm": 6.850564479827881, + "kl": 0.205078125, + "learning_rate": 7.303333333333332e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4045 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.5394666666666666, + "grad_norm": 8.564508438110352, + "kl": 0.22509765625, + "learning_rate": 7.302666666666666e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4046 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.375, + "epoch": 0.5396, + "grad_norm": 7.91030216217041, + "kl": 0.26953125, + "learning_rate": 7.301999999999999e-07, + "loss": 0.0108, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4047 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.625, + "epoch": 0.5397333333333333, + "grad_norm": 7.218141555786133, + "kl": 0.18505859375, + "learning_rate": 7.301333333333333e-07, + "loss": 0.0074, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4048 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.375, + "epoch": 0.5398666666666667, + "grad_norm": 9.877035140991211, + "kl": 0.26806640625, + "learning_rate": 7.300666666666666e-07, + "loss": 0.0107, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4049 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9375, + "epoch": 0.54, + "grad_norm": 7.834061622619629, + "kl": 0.17041015625, + "learning_rate": 7.3e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4050 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.625, + "epoch": 0.5401333333333334, + "grad_norm": 7.137020111083984, + "kl": 0.119873046875, + "learning_rate": 7.299333333333333e-07, + "loss": 0.0048, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4051 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0, + "epoch": 0.5402666666666667, + "grad_norm": 6.9095611572265625, + "kl": 0.30078125, + "learning_rate": 7.298666666666666e-07, + "loss": 0.012, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4052 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.5404, + "grad_norm": 6.775294303894043, + "kl": 0.173095703125, + "learning_rate": 7.298e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4053 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0, + "epoch": 0.5405333333333333, + "grad_norm": 8.102633476257324, + "kl": 0.24365234375, + "learning_rate": 7.297333333333333e-07, + "loss": 0.0097, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4054 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.3125, + "epoch": 0.5406666666666666, + "grad_norm": 6.854649543762207, + "kl": 0.20654296875, + "learning_rate": 7.296666666666667e-07, + "loss": 0.0083, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4055 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5625, + "epoch": 0.5408, + "grad_norm": 8.480731964111328, + "kl": 0.13916015625, + "learning_rate": 7.296e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4056 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.5409333333333334, + "grad_norm": 8.61215877532959, + "kl": 0.149658203125, + "learning_rate": 7.295333333333334e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4057 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.3125, + "epoch": 0.5410666666666667, + "grad_norm": 4.917953014373779, + "kl": 0.17626953125, + "learning_rate": 7.294666666666667e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4058 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.125, + "epoch": 0.5412, + "grad_norm": 3.2062864303588867, + "kl": 0.1982421875, + "learning_rate": 7.294000000000001e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4059 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.875, + "epoch": 0.5413333333333333, + "grad_norm": 2.9003782272338867, + "kl": 0.1396484375, + "learning_rate": 7.293333333333332e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4060 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.5414666666666667, + "grad_norm": 9.708900451660156, + "kl": 0.1953125, + "learning_rate": 7.292666666666665e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4061 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5625, + "epoch": 0.5416, + "grad_norm": 6.786929130554199, + "kl": 0.15576171875, + "learning_rate": 7.291999999999999e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4062 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.4375, + "epoch": 0.5417333333333333, + "grad_norm": 84.89502716064453, + "kl": 0.18603515625, + "learning_rate": 7.291333333333332e-07, + "loss": 0.0075, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4063 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.1875, + "epoch": 0.5418666666666667, + "grad_norm": 7.942811965942383, + "kl": 0.18701171875, + "learning_rate": 7.290666666666666e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4064 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6875, + "epoch": 0.542, + "grad_norm": 7.384347915649414, + "kl": 0.1455078125, + "learning_rate": 7.289999999999999e-07, + "loss": 0.0058, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4065 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.4375, + "epoch": 0.5421333333333334, + "grad_norm": 6.71801233291626, + "kl": 0.2578125, + "learning_rate": 7.289333333333333e-07, + "loss": 0.0103, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4066 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.0, + "epoch": 0.5422666666666667, + "grad_norm": 6.554612636566162, + "kl": 0.130859375, + "learning_rate": 7.288666666666666e-07, + "loss": 0.0052, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4067 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.0625, + "epoch": 0.5424, + "grad_norm": 5.4620208740234375, + "kl": 0.14013671875, + "learning_rate": 7.288e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4068 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 0.5425333333333333, + "grad_norm": 6.835484504699707, + "kl": 0.466796875, + "learning_rate": 7.287333333333333e-07, + "loss": 0.0187, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4069 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.6875, + "epoch": 0.5426666666666666, + "grad_norm": 6.60703182220459, + "kl": 0.18310546875, + "learning_rate": 7.286666666666666e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4070 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 0.5428, + "grad_norm": 5.780707359313965, + "kl": 0.132568359375, + "learning_rate": 7.286e-07, + "loss": 0.0053, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4071 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5625, + "epoch": 0.5429333333333334, + "grad_norm": 6.312164783477783, + "kl": 0.181640625, + "learning_rate": 7.285333333333333e-07, + "loss": 0.0073, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4072 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.25, + "epoch": 0.5430666666666667, + "grad_norm": 3.335344076156616, + "kl": 0.1494140625, + "learning_rate": 7.284666666666667e-07, + "loss": 0.006, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 4073 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5625, + "epoch": 0.5432, + "grad_norm": 5.179124355316162, + "kl": 0.23046875, + "learning_rate": 7.284e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4074 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 0.5433333333333333, + "grad_norm": 6.776741027832031, + "kl": 0.185546875, + "learning_rate": 7.283333333333334e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4075 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.625, + "epoch": 0.5434666666666667, + "grad_norm": 3.8019070625305176, + "kl": 0.123046875, + "learning_rate": 7.282666666666666e-07, + "loss": 0.0049, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4076 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.75, + "epoch": 0.5436, + "grad_norm": 3.9871339797973633, + "kl": 0.1904296875, + "learning_rate": 7.282e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4077 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5, + "epoch": 0.5437333333333333, + "grad_norm": 4.356377124786377, + "kl": 0.17822265625, + "learning_rate": 7.281333333333333e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4078 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.875, + "epoch": 0.5438666666666667, + "grad_norm": 0.32810258865356445, + "kl": 0.20654296875, + "learning_rate": 7.280666666666666e-07, + "loss": 0.0083, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4079 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3125, + "epoch": 0.544, + "grad_norm": 7.271770477294922, + "kl": 0.32763671875, + "learning_rate": 7.28e-07, + "loss": 0.0131, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 4080 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.875, + "epoch": 0.5441333333333334, + "grad_norm": 7.601857662200928, + "kl": 0.25, + "learning_rate": 7.279333333333332e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4081 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.9375, + "epoch": 0.5442666666666667, + "grad_norm": 6.934513568878174, + "kl": 0.140380859375, + "learning_rate": 7.278666666666666e-07, + "loss": 0.0056, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4082 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3125, + "epoch": 0.5444, + "grad_norm": 7.948519706726074, + "kl": 0.1875, + "learning_rate": 7.277999999999999e-07, + "loss": 0.0075, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 4083 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25, + "epoch": 0.5445333333333333, + "grad_norm": 5.597686767578125, + "kl": 0.19677734375, + "learning_rate": 7.277333333333333e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4084 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5625, + "epoch": 0.5446666666666666, + "grad_norm": 6.027921199798584, + "kl": 0.142578125, + "learning_rate": 7.276666666666666e-07, + "loss": 0.0057, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4085 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.125, + "epoch": 0.5448, + "grad_norm": 6.819101810455322, + "kl": 0.15771484375, + "learning_rate": 7.276e-07, + "loss": 0.0063, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4086 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.4375, + "epoch": 0.5449333333333334, + "grad_norm": 3.636545181274414, + "kl": 0.15283203125, + "learning_rate": 7.275333333333333e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4087 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.9375, + "epoch": 0.5450666666666667, + "grad_norm": 3.481921434402466, + "kl": 0.216796875, + "learning_rate": 7.274666666666667e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4088 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.125, + "epoch": 0.5452, + "grad_norm": 7.215536594390869, + "kl": 0.169921875, + "learning_rate": 7.274e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4089 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.5453333333333333, + "grad_norm": 7.152603626251221, + "kl": 0.19091796875, + "learning_rate": 7.273333333333333e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4090 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6875, + "epoch": 0.5454666666666667, + "grad_norm": 4.136414527893066, + "kl": 0.17724609375, + "learning_rate": 7.272666666666666e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4091 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.375, + "epoch": 0.5456, + "grad_norm": 8.928088188171387, + "kl": 0.2685546875, + "learning_rate": 7.271999999999999e-07, + "loss": 0.0108, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4092 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.5457333333333333, + "grad_norm": 11.123517036437988, + "kl": 0.16650390625, + "learning_rate": 7.271333333333333e-07, + "loss": 0.0067, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4093 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.5458666666666666, + "grad_norm": 6.204432964324951, + "kl": 0.173828125, + "learning_rate": 7.270666666666666e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4094 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.625, + "epoch": 0.546, + "grad_norm": 6.830376148223877, + "kl": 0.146484375, + "learning_rate": 7.27e-07, + "loss": 0.0059, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4095 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.8125, + "epoch": 0.5461333333333334, + "grad_norm": 31.71135139465332, + "kl": 1.60107421875, + "learning_rate": 7.269333333333333e-07, + "loss": 0.0643, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4096 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.5462666666666667, + "grad_norm": 8.588899612426758, + "kl": 0.2919921875, + "learning_rate": 7.268666666666667e-07, + "loss": 0.0117, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4097 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.6875, + "epoch": 0.5464, + "grad_norm": 5.878177642822266, + "kl": 0.1259765625, + "learning_rate": 7.268e-07, + "loss": 0.005, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4098 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5625, + "epoch": 0.5465333333333333, + "grad_norm": 6.89259672164917, + "kl": 0.20166015625, + "learning_rate": 7.267333333333333e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4099 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8125, + "epoch": 0.5466666666666666, + "grad_norm": 0.35519304871559143, + "kl": 0.147705078125, + "learning_rate": 7.266666666666667e-07, + "loss": 0.0059, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4100 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1875, + "epoch": 0.5468, + "grad_norm": 4.8898820877075195, + "kl": 0.2734375, + "learning_rate": 7.266e-07, + "loss": 0.011, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4101 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.75, + "epoch": 0.5469333333333334, + "grad_norm": 6.286288261413574, + "kl": 0.23974609375, + "learning_rate": 7.265333333333334e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4102 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.875, + "epoch": 0.5470666666666667, + "grad_norm": 6.430427074432373, + "kl": 0.162109375, + "learning_rate": 7.264666666666666e-07, + "loss": 0.0065, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4103 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0, + "epoch": 0.5472, + "grad_norm": 29.773208618164062, + "kl": 0.1904296875, + "learning_rate": 7.264e-07, + "loss": 0.0076, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4104 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1875, + "epoch": 0.5473333333333333, + "grad_norm": 8.646759986877441, + "kl": 0.2314453125, + "learning_rate": 7.263333333333333e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 4105 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5, + "epoch": 0.5474666666666667, + "grad_norm": 6.839092254638672, + "kl": 0.155517578125, + "learning_rate": 7.262666666666666e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4106 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.25, + "epoch": 0.5476, + "grad_norm": 6.7739458084106445, + "kl": 0.169921875, + "learning_rate": 7.261999999999999e-07, + "loss": 0.0068, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4107 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.625, + "epoch": 0.5477333333333333, + "grad_norm": 9.198273658752441, + "kl": 0.15966796875, + "learning_rate": 7.261333333333332e-07, + "loss": 0.0064, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4108 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 0.5478666666666666, + "grad_norm": 6.95687198638916, + "kl": 0.1982421875, + "learning_rate": 7.260666666666666e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4109 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3125, + "epoch": 0.548, + "grad_norm": 6.562494277954102, + "kl": 0.21826171875, + "learning_rate": 7.259999999999999e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4110 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.375, + "epoch": 0.5481333333333334, + "grad_norm": 4.6587300300598145, + "kl": 0.23583984375, + "learning_rate": 7.259333333333333e-07, + "loss": 0.0094, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4111 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.25, + "epoch": 0.5482666666666667, + "grad_norm": 9.470560073852539, + "kl": 0.17041015625, + "learning_rate": 7.258666666666666e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4112 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.375, + "epoch": 0.5484, + "grad_norm": 5.274168014526367, + "kl": 0.1884765625, + "learning_rate": 7.258e-07, + "loss": 0.0075, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4113 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.5485333333333333, + "grad_norm": 0.39878538250923157, + "kl": 0.2197265625, + "learning_rate": 7.257333333333333e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4114 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0625, + "epoch": 0.5486666666666666, + "grad_norm": 5.126521110534668, + "kl": 0.21875, + "learning_rate": 7.256666666666667e-07, + "loss": 0.0087, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4115 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.0625, + "epoch": 0.5488, + "grad_norm": 5.769069671630859, + "kl": 0.13720703125, + "learning_rate": 7.256e-07, + "loss": 0.0055, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4116 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 0.5489333333333334, + "grad_norm": 10.68373966217041, + "kl": 0.177734375, + "learning_rate": 7.255333333333333e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4117 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 0.5490666666666667, + "grad_norm": 0.3452586531639099, + "kl": 0.17431640625, + "learning_rate": 7.254666666666667e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4118 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5625, + "epoch": 0.5492, + "grad_norm": 3.446183204650879, + "kl": 0.1787109375, + "learning_rate": 7.254e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4119 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5625, + "epoch": 0.5493333333333333, + "grad_norm": 6.6461358070373535, + "kl": 0.1748046875, + "learning_rate": 7.253333333333334e-07, + "loss": 0.007, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4120 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.9375, + "epoch": 0.5494666666666667, + "grad_norm": 8.155607223510742, + "kl": 0.1845703125, + "learning_rate": 7.252666666666666e-07, + "loss": 0.0074, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4121 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.5496, + "grad_norm": 9.026846885681152, + "kl": 0.27490234375, + "learning_rate": 7.252e-07, + "loss": 0.011, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4122 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 0.5497333333333333, + "grad_norm": 4.310037612915039, + "kl": 0.2373046875, + "learning_rate": 7.251333333333332e-07, + "loss": 0.0095, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4123 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.875, + "epoch": 0.5498666666666666, + "grad_norm": 6.897101879119873, + "kl": 0.18994140625, + "learning_rate": 7.250666666666666e-07, + "loss": 0.0076, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4124 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.55, + "grad_norm": 7.340378284454346, + "kl": 0.14599609375, + "learning_rate": 7.249999999999999e-07, + "loss": 0.0058, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0, + "epoch": 0.5501333333333334, + "grad_norm": 4.5945210456848145, + "kl": 0.1435546875, + "learning_rate": 7.249333333333332e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4126 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.125, + "epoch": 0.5502666666666667, + "grad_norm": 7.652543067932129, + "kl": 0.1728515625, + "learning_rate": 7.248666666666666e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4127 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6875, + "epoch": 0.5504, + "grad_norm": 8.306326866149902, + "kl": 0.150634765625, + "learning_rate": 7.247999999999999e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 4128 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.625, + "epoch": 0.5505333333333333, + "grad_norm": 7.005170822143555, + "kl": 0.15625, + "learning_rate": 7.247333333333333e-07, + "loss": 0.0062, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4129 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.375, + "epoch": 0.5506666666666666, + "grad_norm": 9.3284330368042, + "kl": 0.25, + "learning_rate": 7.246666666666666e-07, + "loss": 0.01, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4130 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.5508, + "grad_norm": 3.742234706878662, + "kl": 0.18115234375, + "learning_rate": 7.246e-07, + "loss": 0.0073, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4131 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9375, + "epoch": 0.5509333333333334, + "grad_norm": 6.351268768310547, + "kl": 0.1787109375, + "learning_rate": 7.245333333333333e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4132 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.75, + "epoch": 0.5510666666666667, + "grad_norm": 38.685848236083984, + "kl": 2.64208984375, + "learning_rate": 7.244666666666667e-07, + "loss": 0.1056, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 4133 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.625, + "epoch": 0.5512, + "grad_norm": 7.95183801651001, + "kl": 0.20703125, + "learning_rate": 7.244e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4134 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0, + "epoch": 0.5513333333333333, + "grad_norm": 5.584986686706543, + "kl": 0.23828125, + "learning_rate": 7.243333333333334e-07, + "loss": 0.0095, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4135 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5, + "epoch": 0.5514666666666667, + "grad_norm": 7.54208517074585, + "kl": 0.154052734375, + "learning_rate": 7.242666666666666e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4136 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.6875, + "epoch": 0.5516, + "grad_norm": 4.808022499084473, + "kl": 0.13525390625, + "learning_rate": 7.241999999999999e-07, + "loss": 0.0054, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4137 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.375, + "epoch": 0.5517333333333333, + "grad_norm": 0.3668343424797058, + "kl": 0.1943359375, + "learning_rate": 7.241333333333333e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4138 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.3125, + "epoch": 0.5518666666666666, + "grad_norm": 7.0997700691223145, + "kl": 0.14990234375, + "learning_rate": 7.240666666666666e-07, + "loss": 0.006, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4139 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.9375, + "epoch": 0.552, + "grad_norm": 8.1074800491333, + "kl": 0.26611328125, + "learning_rate": 7.24e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4140 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 0.5521333333333334, + "grad_norm": 4.899609565734863, + "kl": 0.138671875, + "learning_rate": 7.239333333333333e-07, + "loss": 0.0055, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4141 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0, + "epoch": 0.5522666666666667, + "grad_norm": 4.927398681640625, + "kl": 0.2294921875, + "learning_rate": 7.238666666666667e-07, + "loss": 0.0092, + "reward": 1.125, + "reward_std": 0.4432026147842407, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 4142 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0, + "epoch": 0.5524, + "grad_norm": 15.58929443359375, + "kl": 0.7734375, + "learning_rate": 7.238e-07, + "loss": 0.0309, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4143 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.625, + "epoch": 0.5525333333333333, + "grad_norm": 5.435781478881836, + "kl": 0.26123046875, + "learning_rate": 7.237333333333334e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4144 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5, + "epoch": 0.5526666666666666, + "grad_norm": 19.981754302978516, + "kl": 0.3876953125, + "learning_rate": 7.236666666666666e-07, + "loss": 0.0155, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.8125, + "step": 4145 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8125, + "epoch": 0.5528, + "grad_norm": 6.044276237487793, + "kl": 0.1416015625, + "learning_rate": 7.235999999999999e-07, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4146 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0625, + "epoch": 0.5529333333333334, + "grad_norm": 6.324508190155029, + "kl": 0.20703125, + "learning_rate": 7.235333333333333e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4147 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.4375, + "epoch": 0.5530666666666667, + "grad_norm": 8.571894645690918, + "kl": 0.2685546875, + "learning_rate": 7.234666666666666e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4148 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.25, + "epoch": 0.5532, + "grad_norm": 9.301763534545898, + "kl": 0.26171875, + "learning_rate": 7.234e-07, + "loss": 0.0105, + "reward": 1.0625, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 4149 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25, + "epoch": 0.5533333333333333, + "grad_norm": 9.2692289352417, + "kl": 0.13037109375, + "learning_rate": 7.233333333333333e-07, + "loss": 0.0052, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4150 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.5534666666666667, + "grad_norm": 0.5041899085044861, + "kl": 0.23779296875, + "learning_rate": 7.232666666666666e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4151 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.4375, + "epoch": 0.5536, + "grad_norm": 0.3204343020915985, + "kl": 0.173828125, + "learning_rate": 7.231999999999999e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4152 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0625, + "epoch": 0.5537333333333333, + "grad_norm": 7.908004283905029, + "kl": 0.24462890625, + "learning_rate": 7.231333333333333e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4153 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 0.5538666666666666, + "grad_norm": 5.737460613250732, + "kl": 0.22314453125, + "learning_rate": 7.230666666666666e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4154 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.125, + "epoch": 0.554, + "grad_norm": 5.397146701812744, + "kl": 0.16015625, + "learning_rate": 7.229999999999999e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4155 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0, + "epoch": 0.5541333333333334, + "grad_norm": 6.810408115386963, + "kl": 0.26220703125, + "learning_rate": 7.229333333333333e-07, + "loss": 0.0105, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4156 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.5542666666666667, + "grad_norm": 8.363146781921387, + "kl": 0.2490234375, + "learning_rate": 7.228666666666666e-07, + "loss": 0.0099, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4157 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4375, + "epoch": 0.5544, + "grad_norm": 4.74275016784668, + "kl": 0.13232421875, + "learning_rate": 7.228e-07, + "loss": 0.0053, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4158 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.5545333333333333, + "grad_norm": 7.6596832275390625, + "kl": 0.17626953125, + "learning_rate": 7.227333333333333e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4159 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.5546666666666666, + "grad_norm": 8.835427284240723, + "kl": 0.2119140625, + "learning_rate": 7.226666666666667e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4160 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.5548, + "grad_norm": 8.32198715209961, + "kl": 0.20654296875, + "learning_rate": 7.226e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 4161 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.875, + "epoch": 0.5549333333333333, + "grad_norm": 6.933828830718994, + "kl": 0.24853515625, + "learning_rate": 7.225333333333334e-07, + "loss": 0.01, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4162 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.5625, + "epoch": 0.5550666666666667, + "grad_norm": 5.815232753753662, + "kl": 0.14697265625, + "learning_rate": 7.224666666666667e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4163 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 0.5552, + "grad_norm": 0.4467938244342804, + "kl": 0.2509765625, + "learning_rate": 7.224e-07, + "loss": 0.0101, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4164 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.4375, + "epoch": 0.5553333333333333, + "grad_norm": 5.950343132019043, + "kl": 0.24072265625, + "learning_rate": 7.223333333333334e-07, + "loss": 0.0096, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4165 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 0.5554666666666667, + "grad_norm": 6.816548824310303, + "kl": 0.18896484375, + "learning_rate": 7.222666666666665e-07, + "loss": 0.0076, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4166 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3125, + "epoch": 0.5556, + "grad_norm": 6.078239440917969, + "kl": 0.228515625, + "learning_rate": 7.221999999999999e-07, + "loss": 0.0091, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4167 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0, + "epoch": 0.5557333333333333, + "grad_norm": 4.619035243988037, + "kl": 0.140625, + "learning_rate": 7.221333333333332e-07, + "loss": 0.0056, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4168 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25, + "epoch": 0.5558666666666666, + "grad_norm": 4.630059719085693, + "kl": 0.19091796875, + "learning_rate": 7.220666666666666e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4169 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.556, + "grad_norm": 0.3150903582572937, + "kl": 0.17138671875, + "learning_rate": 7.219999999999999e-07, + "loss": 0.0068, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4170 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 0.5561333333333334, + "grad_norm": 6.025624752044678, + "kl": 0.3486328125, + "learning_rate": 7.219333333333333e-07, + "loss": 0.0139, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4171 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.75, + "epoch": 0.5562666666666667, + "grad_norm": 9.68267822265625, + "kl": 0.3017578125, + "learning_rate": 7.218666666666666e-07, + "loss": 0.0121, + "reward": 1.25, + "reward_std": 0.7559289336204529, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 4172 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.625, + "epoch": 0.5564, + "grad_norm": 7.806920051574707, + "kl": 0.17236328125, + "learning_rate": 7.217999999999999e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4173 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.5565333333333333, + "grad_norm": 8.945558547973633, + "kl": 0.1953125, + "learning_rate": 7.217333333333333e-07, + "loss": 0.0078, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4174 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 0.5566666666666666, + "grad_norm": 11.769087791442871, + "kl": 0.2265625, + "learning_rate": 7.216666666666666e-07, + "loss": 0.0091, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4175 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.0, + "epoch": 0.5568, + "grad_norm": 0.4365101754665375, + "kl": 0.298828125, + "learning_rate": 7.216e-07, + "loss": 0.012, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4176 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.6875, + "epoch": 0.5569333333333333, + "grad_norm": 8.485132217407227, + "kl": 0.208984375, + "learning_rate": 7.215333333333333e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4177 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.9375, + "epoch": 0.5570666666666667, + "grad_norm": 6.20328426361084, + "kl": 0.18212890625, + "learning_rate": 7.214666666666667e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4178 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.5572, + "grad_norm": 0.5088522434234619, + "kl": 0.134765625, + "learning_rate": 7.214e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4179 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.125, + "epoch": 0.5573333333333333, + "grad_norm": 5.84088134765625, + "kl": 0.205078125, + "learning_rate": 7.213333333333334e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4180 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.125, + "epoch": 0.5574666666666667, + "grad_norm": 0.38690894842147827, + "kl": 0.24755859375, + "learning_rate": 7.212666666666666e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4181 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.4375, + "epoch": 0.5576, + "grad_norm": 9.34630012512207, + "kl": 0.27734375, + "learning_rate": 7.211999999999999e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4182 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.5577333333333333, + "grad_norm": 7.570427417755127, + "kl": 0.150390625, + "learning_rate": 7.211333333333333e-07, + "loss": 0.006, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4183 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.4375, + "epoch": 0.5578666666666666, + "grad_norm": 11.508764266967773, + "kl": 0.15625, + "learning_rate": 7.210666666666666e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4184 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.558, + "grad_norm": 8.480989456176758, + "kl": 0.19287109375, + "learning_rate": 7.21e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4185 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.75, + "epoch": 0.5581333333333334, + "grad_norm": 11.789133071899414, + "kl": 0.26318359375, + "learning_rate": 7.209333333333333e-07, + "loss": 0.0105, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4186 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.5582666666666667, + "grad_norm": 8.626102447509766, + "kl": 0.15673828125, + "learning_rate": 7.208666666666666e-07, + "loss": 0.0063, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4187 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.75, + "epoch": 0.5584, + "grad_norm": 5.3263702392578125, + "kl": 0.259765625, + "learning_rate": 7.207999999999999e-07, + "loss": 0.0104, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4188 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.375, + "epoch": 0.5585333333333333, + "grad_norm": 4.297213077545166, + "kl": 0.27685546875, + "learning_rate": 7.207333333333333e-07, + "loss": 0.0111, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4189 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.6875, + "epoch": 0.5586666666666666, + "grad_norm": 10.612186431884766, + "kl": 0.13427734375, + "learning_rate": 7.206666666666666e-07, + "loss": 0.0054, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4190 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.25, + "epoch": 0.5588, + "grad_norm": 5.399046897888184, + "kl": 0.14599609375, + "learning_rate": 7.206e-07, + "loss": 0.0059, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4191 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.1875, + "epoch": 0.5589333333333333, + "grad_norm": 7.562056541442871, + "kl": 0.19873046875, + "learning_rate": 7.205333333333333e-07, + "loss": 0.0079, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4192 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9375, + "epoch": 0.5590666666666667, + "grad_norm": 25.791597366333008, + "kl": 0.19677734375, + "learning_rate": 7.204666666666666e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4193 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.1875, + "epoch": 0.5592, + "grad_norm": 10.369585037231445, + "kl": 0.4052734375, + "learning_rate": 7.204e-07, + "loss": 0.0162, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4194 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.0625, + "epoch": 0.5593333333333333, + "grad_norm": 5.824217796325684, + "kl": 0.16943359375, + "learning_rate": 7.203333333333333e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 4195 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5625, + "epoch": 0.5594666666666667, + "grad_norm": 6.854342937469482, + "kl": 0.17236328125, + "learning_rate": 7.202666666666667e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4196 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.8125, + "epoch": 0.5596, + "grad_norm": 12.229019165039062, + "kl": 0.1875, + "learning_rate": 7.201999999999999e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4197 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.5597333333333333, + "grad_norm": 4.931234359741211, + "kl": 0.20458984375, + "learning_rate": 7.201333333333333e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4198 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.3125, + "epoch": 0.5598666666666666, + "grad_norm": 5.114046096801758, + "kl": 0.203125, + "learning_rate": 7.200666666666666e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4199 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.125, + "epoch": 0.56, + "grad_norm": 9.269318580627441, + "kl": 0.2607421875, + "learning_rate": 7.2e-07, + "loss": 0.0104, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4200 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.5601333333333334, + "grad_norm": 4.542038917541504, + "kl": 0.22265625, + "learning_rate": 7.199333333333333e-07, + "loss": 0.0089, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4201 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 0.5602666666666667, + "grad_norm": 7.035290241241455, + "kl": 0.16455078125, + "learning_rate": 7.198666666666666e-07, + "loss": 0.0066, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4202 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.5604, + "grad_norm": 4.888786792755127, + "kl": 0.1787109375, + "learning_rate": 7.198e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4203 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5, + "epoch": 0.5605333333333333, + "grad_norm": 7.3580780029296875, + "kl": 0.15478515625, + "learning_rate": 7.197333333333333e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4204 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.5606666666666666, + "grad_norm": 8.50417709350586, + "kl": 0.18603515625, + "learning_rate": 7.196666666666667e-07, + "loss": 0.0074, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4205 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.3125, + "epoch": 0.5608, + "grad_norm": 6.436810493469238, + "kl": 0.234375, + "learning_rate": 7.196e-07, + "loss": 0.0094, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4206 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.5609333333333333, + "grad_norm": 7.549186706542969, + "kl": 0.206298828125, + "learning_rate": 7.195333333333334e-07, + "loss": 0.0083, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4207 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.8125, + "epoch": 0.5610666666666667, + "grad_norm": 7.692330360412598, + "kl": 0.2529296875, + "learning_rate": 7.194666666666667e-07, + "loss": 0.0101, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4208 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.5612, + "grad_norm": 7.427379608154297, + "kl": 0.16748046875, + "learning_rate": 7.194e-07, + "loss": 0.0067, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4209 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 0.5613333333333334, + "grad_norm": 7.2634596824646, + "kl": 0.2041015625, + "learning_rate": 7.193333333333333e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4210 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.3125, + "epoch": 0.5614666666666667, + "grad_norm": 5.06339168548584, + "kl": 0.2021484375, + "learning_rate": 7.192666666666666e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4211 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.8125, + "epoch": 0.5616, + "grad_norm": 7.755531311035156, + "kl": 0.275390625, + "learning_rate": 7.191999999999999e-07, + "loss": 0.011, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4212 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.5617333333333333, + "grad_norm": 4.33629035949707, + "kl": 0.2568359375, + "learning_rate": 7.191333333333332e-07, + "loss": 0.0103, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4213 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 0.5618666666666666, + "grad_norm": 4.718466758728027, + "kl": 0.205078125, + "learning_rate": 7.190666666666666e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4214 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.562, + "grad_norm": 8.844796180725098, + "kl": 0.17236328125, + "learning_rate": 7.189999999999999e-07, + "loss": 0.0069, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4215 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.5621333333333334, + "grad_norm": 4.400461673736572, + "kl": 0.2373046875, + "learning_rate": 7.189333333333333e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4216 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.5622666666666667, + "grad_norm": 10.02880573272705, + "kl": 0.2177734375, + "learning_rate": 7.188666666666666e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4217 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.125, + "epoch": 0.5624, + "grad_norm": 0.5473690629005432, + "kl": 0.18310546875, + "learning_rate": 7.188e-07, + "loss": 0.0073, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 4218 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.9375, + "epoch": 0.5625333333333333, + "grad_norm": 8.812870979309082, + "kl": 0.25341796875, + "learning_rate": 7.187333333333333e-07, + "loss": 0.0101, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4219 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0625, + "epoch": 0.5626666666666666, + "grad_norm": 8.795110702514648, + "kl": 0.21826171875, + "learning_rate": 7.186666666666667e-07, + "loss": 0.0087, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4220 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.5628, + "grad_norm": 5.749368190765381, + "kl": 0.22216796875, + "learning_rate": 7.186e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4221 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.5629333333333333, + "grad_norm": 3.702043294906616, + "kl": 0.17626953125, + "learning_rate": 7.185333333333333e-07, + "loss": 0.0071, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4222 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.3125, + "epoch": 0.5630666666666667, + "grad_norm": 0.32275256514549255, + "kl": 0.22607421875, + "learning_rate": 7.184666666666667e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4223 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.9375, + "epoch": 0.5632, + "grad_norm": 7.328010082244873, + "kl": 0.2412109375, + "learning_rate": 7.184e-07, + "loss": 0.0096, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4224 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.5, + "epoch": 0.5633333333333334, + "grad_norm": 8.851303100585938, + "kl": 0.2451171875, + "learning_rate": 7.183333333333334e-07, + "loss": 0.0098, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4225 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.8125, + "epoch": 0.5634666666666667, + "grad_norm": 8.596783638000488, + "kl": 0.232421875, + "learning_rate": 7.182666666666667e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4226 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5625, + "epoch": 0.5636, + "grad_norm": 6.199736595153809, + "kl": 0.22802734375, + "learning_rate": 7.182e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4227 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.5637333333333333, + "grad_norm": 5.477924823760986, + "kl": 0.2197265625, + "learning_rate": 7.181333333333333e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4228 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.375, + "epoch": 0.5638666666666666, + "grad_norm": 5.387711524963379, + "kl": 0.2412109375, + "learning_rate": 7.180666666666667e-07, + "loss": 0.0097, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4229 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.3125, + "epoch": 0.564, + "grad_norm": 7.8408379554748535, + "kl": 0.21533203125, + "learning_rate": 7.179999999999999e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4230 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.5641333333333334, + "grad_norm": 6.747129440307617, + "kl": 0.17822265625, + "learning_rate": 7.179333333333332e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4231 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.875, + "epoch": 0.5642666666666667, + "grad_norm": 10.091721534729004, + "kl": 0.19970703125, + "learning_rate": 7.178666666666666e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4232 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.1875, + "epoch": 0.5644, + "grad_norm": 4.934943675994873, + "kl": 0.2490234375, + "learning_rate": 7.177999999999999e-07, + "loss": 0.01, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4233 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 0.5645333333333333, + "grad_norm": 4.235811233520508, + "kl": 0.18408203125, + "learning_rate": 7.177333333333333e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4234 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.1875, + "epoch": 0.5646666666666667, + "grad_norm": 6.277471542358398, + "kl": 0.22021484375, + "learning_rate": 7.176666666666666e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4235 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.5648, + "grad_norm": 13.610584259033203, + "kl": 0.26806640625, + "learning_rate": 7.176e-07, + "loss": 0.0107, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4236 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.625, + "epoch": 0.5649333333333333, + "grad_norm": 7.088472366333008, + "kl": 0.2255859375, + "learning_rate": 7.175333333333333e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4237 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5625, + "epoch": 0.5650666666666667, + "grad_norm": 7.672840118408203, + "kl": 0.35546875, + "learning_rate": 7.174666666666667e-07, + "loss": 0.0142, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4238 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.5652, + "grad_norm": 10.156291961669922, + "kl": 0.2099609375, + "learning_rate": 7.174e-07, + "loss": 0.0084, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4239 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 0.5653333333333334, + "grad_norm": 5.871128082275391, + "kl": 0.19677734375, + "learning_rate": 7.173333333333333e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4240 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.75, + "epoch": 0.5654666666666667, + "grad_norm": 7.171163082122803, + "kl": 0.19970703125, + "learning_rate": 7.172666666666667e-07, + "loss": 0.008, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4241 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0, + "epoch": 0.5656, + "grad_norm": 7.718696594238281, + "kl": 0.18359375, + "learning_rate": 7.171999999999999e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4242 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1875, + "epoch": 0.5657333333333333, + "grad_norm": 6.684162616729736, + "kl": 0.16845703125, + "learning_rate": 7.171333333333333e-07, + "loss": 0.0067, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4243 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.4375, + "epoch": 0.5658666666666666, + "grad_norm": 8.20919132232666, + "kl": 0.263671875, + "learning_rate": 7.170666666666666e-07, + "loss": 0.0106, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4244 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.625, + "epoch": 0.566, + "grad_norm": 0.362820565700531, + "kl": 0.20361328125, + "learning_rate": 7.17e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4245 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.8125, + "epoch": 0.5661333333333334, + "grad_norm": 4.654179096221924, + "kl": 0.22021484375, + "learning_rate": 7.169333333333333e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4246 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5, + "epoch": 0.5662666666666667, + "grad_norm": 5.999040126800537, + "kl": 0.25634765625, + "learning_rate": 7.168666666666667e-07, + "loss": 0.0103, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4247 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.8125, + "epoch": 0.5664, + "grad_norm": 6.305879592895508, + "kl": 0.3134765625, + "learning_rate": 7.168e-07, + "loss": 0.0125, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4248 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.75, + "epoch": 0.5665333333333333, + "grad_norm": 5.886899948120117, + "kl": 0.27392578125, + "learning_rate": 7.167333333333333e-07, + "loss": 0.0109, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 4249 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.25, + "epoch": 0.5666666666666667, + "grad_norm": 5.545045375823975, + "kl": 0.26953125, + "learning_rate": 7.166666666666667e-07, + "loss": 0.0108, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4250 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.5668, + "grad_norm": 10.620019912719727, + "kl": 0.181396484375, + "learning_rate": 7.165999999999999e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4251 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.1875, + "epoch": 0.5669333333333333, + "grad_norm": 8.127917289733887, + "kl": 0.28662109375, + "learning_rate": 7.165333333333333e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4252 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 0.5670666666666667, + "grad_norm": 4.005194187164307, + "kl": 0.26611328125, + "learning_rate": 7.164666666666666e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4253 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.5672, + "grad_norm": 3.8220911026000977, + "kl": 0.18896484375, + "learning_rate": 7.164e-07, + "loss": 0.0076, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4254 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.4375, + "epoch": 0.5673333333333334, + "grad_norm": 17.34773826599121, + "kl": 0.322265625, + "learning_rate": 7.163333333333333e-07, + "loss": 0.0129, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4255 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5625, + "epoch": 0.5674666666666667, + "grad_norm": 8.428723335266113, + "kl": 0.3095703125, + "learning_rate": 7.162666666666667e-07, + "loss": 0.0124, + "reward": 1.0625, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.8125, + "step": 4256 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.4375, + "epoch": 0.5676, + "grad_norm": 11.55212688446045, + "kl": 0.18212890625, + "learning_rate": 7.161999999999999e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4257 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5625, + "epoch": 0.5677333333333333, + "grad_norm": 4.880269527435303, + "kl": 0.20068359375, + "learning_rate": 7.161333333333332e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4258 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.75, + "epoch": 0.5678666666666666, + "grad_norm": 0.3692430555820465, + "kl": 0.3388671875, + "learning_rate": 7.160666666666666e-07, + "loss": 0.0136, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4259 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0, + "epoch": 0.568, + "grad_norm": 3.63777494430542, + "kl": 0.19189453125, + "learning_rate": 7.159999999999999e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4260 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.75, + "epoch": 0.5681333333333334, + "grad_norm": 5.935361385345459, + "kl": 0.22998046875, + "learning_rate": 7.159333333333333e-07, + "loss": 0.0092, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4261 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5, + "epoch": 0.5682666666666667, + "grad_norm": 7.65002965927124, + "kl": 0.201171875, + "learning_rate": 7.158666666666666e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4262 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.6875, + "epoch": 0.5684, + "grad_norm": 11.434136390686035, + "kl": 0.39453125, + "learning_rate": 7.158e-07, + "loss": 0.0158, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4263 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.125, + "epoch": 0.5685333333333333, + "grad_norm": 6.6069159507751465, + "kl": 0.21728515625, + "learning_rate": 7.157333333333333e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4264 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.8125, + "epoch": 0.5686666666666667, + "grad_norm": 5.1787590980529785, + "kl": 0.1640625, + "learning_rate": 7.156666666666667e-07, + "loss": 0.0066, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4265 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0, + "epoch": 0.5688, + "grad_norm": 0.41534993052482605, + "kl": 0.2421875, + "learning_rate": 7.156e-07, + "loss": 0.0097, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4266 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5625, + "epoch": 0.5689333333333333, + "grad_norm": 10.56264591217041, + "kl": 0.2783203125, + "learning_rate": 7.155333333333334e-07, + "loss": 0.0111, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4267 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.3125, + "epoch": 0.5690666666666667, + "grad_norm": 0.39264658093452454, + "kl": 0.24462890625, + "learning_rate": 7.154666666666667e-07, + "loss": 0.0098, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4268 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.5692, + "grad_norm": 4.634465217590332, + "kl": 0.31884765625, + "learning_rate": 7.154e-07, + "loss": 0.0127, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4269 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.9375, + "epoch": 0.5693333333333334, + "grad_norm": 4.773649215698242, + "kl": 0.19384765625, + "learning_rate": 7.153333333333334e-07, + "loss": 0.0078, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4270 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.5694666666666667, + "grad_norm": 7.279715538024902, + "kl": 0.17724609375, + "learning_rate": 7.152666666666667e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4271 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 0.5696, + "grad_norm": 4.639569282531738, + "kl": 0.24853515625, + "learning_rate": 7.151999999999999e-07, + "loss": 0.0099, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4272 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 0.5697333333333333, + "grad_norm": 6.614616870880127, + "kl": 0.14599609375, + "learning_rate": 7.151333333333332e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4273 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.625, + "epoch": 0.5698666666666666, + "grad_norm": 8.437848091125488, + "kl": 0.249755859375, + "learning_rate": 7.150666666666666e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4274 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.57, + "grad_norm": 8.451658248901367, + "kl": 0.24609375, + "learning_rate": 7.149999999999999e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4275 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.5701333333333334, + "grad_norm": 7.453205585479736, + "kl": 0.244140625, + "learning_rate": 7.149333333333333e-07, + "loss": 0.0097, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4276 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0625, + "epoch": 0.5702666666666667, + "grad_norm": 6.973117828369141, + "kl": 0.1943359375, + "learning_rate": 7.148666666666666e-07, + "loss": 0.0078, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4277 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.375, + "epoch": 0.5704, + "grad_norm": 8.340611457824707, + "kl": 0.2587890625, + "learning_rate": 7.147999999999999e-07, + "loss": 0.0104, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4278 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.0625, + "epoch": 0.5705333333333333, + "grad_norm": 8.041186332702637, + "kl": 0.2724609375, + "learning_rate": 7.147333333333333e-07, + "loss": 0.0109, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4279 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.875, + "epoch": 0.5706666666666667, + "grad_norm": 7.344924449920654, + "kl": 0.2216796875, + "learning_rate": 7.146666666666666e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4280 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.625, + "epoch": 0.5708, + "grad_norm": 4.437053203582764, + "kl": 0.2841796875, + "learning_rate": 7.146e-07, + "loss": 0.0113, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4281 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.75, + "epoch": 0.5709333333333333, + "grad_norm": 7.638561248779297, + "kl": 0.27490234375, + "learning_rate": 7.145333333333333e-07, + "loss": 0.011, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4282 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.625, + "epoch": 0.5710666666666666, + "grad_norm": 6.5526957511901855, + "kl": 0.29296875, + "learning_rate": 7.144666666666667e-07, + "loss": 0.0117, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4283 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.125, + "epoch": 0.5712, + "grad_norm": 6.668028831481934, + "kl": 0.16259765625, + "learning_rate": 7.144e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4284 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.5713333333333334, + "grad_norm": 4.334467887878418, + "kl": 0.20947265625, + "learning_rate": 7.143333333333334e-07, + "loss": 0.0084, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4285 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.125, + "epoch": 0.5714666666666667, + "grad_norm": 8.93786907196045, + "kl": 0.34375, + "learning_rate": 7.142666666666667e-07, + "loss": 0.0137, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4286 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.5716, + "grad_norm": 8.079756736755371, + "kl": 0.24560546875, + "learning_rate": 7.141999999999999e-07, + "loss": 0.0098, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4287 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 0.5717333333333333, + "grad_norm": 8.411520957946777, + "kl": 0.22119140625, + "learning_rate": 7.141333333333333e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4288 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.5718666666666666, + "grad_norm": 9.185688018798828, + "kl": 0.21826171875, + "learning_rate": 7.140666666666666e-07, + "loss": 0.0087, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4289 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5, + "epoch": 0.572, + "grad_norm": 4.838426113128662, + "kl": 0.2060546875, + "learning_rate": 7.14e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4290 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0, + "epoch": 0.5721333333333334, + "grad_norm": 5.551583766937256, + "kl": 0.2734375, + "learning_rate": 7.139333333333333e-07, + "loss": 0.0109, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4291 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.375, + "epoch": 0.5722666666666667, + "grad_norm": 5.564574718475342, + "kl": 0.3232421875, + "learning_rate": 7.138666666666667e-07, + "loss": 0.0129, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4292 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.625, + "epoch": 0.5724, + "grad_norm": 6.140762805938721, + "kl": 0.19287109375, + "learning_rate": 7.137999999999999e-07, + "loss": 0.0077, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4293 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.875, + "epoch": 0.5725333333333333, + "grad_norm": 6.558533191680908, + "kl": 0.21337890625, + "learning_rate": 7.137333333333333e-07, + "loss": 0.0085, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4294 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.875, + "epoch": 0.5726666666666667, + "grad_norm": 4.622015476226807, + "kl": 0.212890625, + "learning_rate": 7.136666666666666e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4295 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.25, + "epoch": 0.5728, + "grad_norm": 8.467334747314453, + "kl": 0.30126953125, + "learning_rate": 7.135999999999999e-07, + "loss": 0.0121, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4296 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.1875, + "epoch": 0.5729333333333333, + "grad_norm": 8.978205680847168, + "kl": 0.337890625, + "learning_rate": 7.135333333333333e-07, + "loss": 0.0135, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 4297 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 0.5730666666666666, + "grad_norm": 7.949334621429443, + "kl": 0.2626953125, + "learning_rate": 7.134666666666666e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4298 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.5732, + "grad_norm": 7.467451572418213, + "kl": 0.24609375, + "learning_rate": 7.134e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4299 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.375, + "epoch": 0.5733333333333334, + "grad_norm": 5.677906513214111, + "kl": 0.2001953125, + "learning_rate": 7.133333333333333e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4300 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.1875, + "epoch": 0.5734666666666667, + "grad_norm": 5.948457717895508, + "kl": 0.3984375, + "learning_rate": 7.132666666666667e-07, + "loss": 0.0159, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4301 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.25, + "epoch": 0.5736, + "grad_norm": 7.909080505371094, + "kl": 0.24755859375, + "learning_rate": 7.131999999999999e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4302 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.4375, + "epoch": 0.5737333333333333, + "grad_norm": 6.609481334686279, + "kl": 0.2890625, + "learning_rate": 7.131333333333333e-07, + "loss": 0.0116, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4303 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0625, + "epoch": 0.5738666666666666, + "grad_norm": 25.21428871154785, + "kl": 0.41015625, + "learning_rate": 7.130666666666666e-07, + "loss": 0.0164, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4304 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.4375, + "epoch": 0.574, + "grad_norm": 8.180609703063965, + "kl": 0.21630859375, + "learning_rate": 7.129999999999999e-07, + "loss": 0.0087, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4305 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 0.5741333333333334, + "grad_norm": 6.8266143798828125, + "kl": 0.20166015625, + "learning_rate": 7.129333333333333e-07, + "loss": 0.008, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4306 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.5742666666666667, + "grad_norm": 7.283881187438965, + "kl": 0.22265625, + "learning_rate": 7.128666666666666e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4307 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5625, + "epoch": 0.5744, + "grad_norm": 11.91899585723877, + "kl": 0.24267578125, + "learning_rate": 7.128e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4308 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8125, + "epoch": 0.5745333333333333, + "grad_norm": 3.773951292037964, + "kl": 0.2021484375, + "learning_rate": 7.127333333333333e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4309 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.625, + "epoch": 0.5746666666666667, + "grad_norm": 4.129554271697998, + "kl": 0.22216796875, + "learning_rate": 7.126666666666667e-07, + "loss": 0.0089, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4310 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1875, + "epoch": 0.5748, + "grad_norm": 7.461606025695801, + "kl": 0.1787109375, + "learning_rate": 7.126e-07, + "loss": 0.0071, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4311 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0625, + "epoch": 0.5749333333333333, + "grad_norm": 5.952764987945557, + "kl": 0.1748046875, + "learning_rate": 7.125333333333334e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4312 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.0625, + "epoch": 0.5750666666666666, + "grad_norm": 19.11234474182129, + "kl": 0.638671875, + "learning_rate": 7.124666666666667e-07, + "loss": 0.0256, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4313 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.6875, + "epoch": 0.5752, + "grad_norm": 5.422458648681641, + "kl": 0.130615234375, + "learning_rate": 7.124e-07, + "loss": 0.0052, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4314 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.3125, + "epoch": 0.5753333333333334, + "grad_norm": 4.25908899307251, + "kl": 0.15185546875, + "learning_rate": 7.123333333333333e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4315 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 0.5754666666666667, + "grad_norm": 7.330711364746094, + "kl": 0.20263671875, + "learning_rate": 7.122666666666666e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4316 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5625, + "epoch": 0.5756, + "grad_norm": 6.973677635192871, + "kl": 0.2119140625, + "learning_rate": 7.122e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4317 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.875, + "epoch": 0.5757333333333333, + "grad_norm": 6.4275641441345215, + "kl": 0.1953125, + "learning_rate": 7.121333333333332e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4318 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.5758666666666666, + "grad_norm": 4.206933498382568, + "kl": 0.27099609375, + "learning_rate": 7.120666666666666e-07, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4319 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.125, + "epoch": 0.576, + "grad_norm": 5.638805866241455, + "kl": 0.21533203125, + "learning_rate": 7.119999999999999e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4320 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 0.5761333333333334, + "grad_norm": 7.542486190795898, + "kl": 0.2509765625, + "learning_rate": 7.119333333333333e-07, + "loss": 0.01, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4321 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.75, + "epoch": 0.5762666666666667, + "grad_norm": 5.245857238769531, + "kl": 0.2060546875, + "learning_rate": 7.118666666666666e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4322 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.625, + "epoch": 0.5764, + "grad_norm": 10.065956115722656, + "kl": 0.7470703125, + "learning_rate": 7.118e-07, + "loss": 0.03, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4323 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.9375, + "epoch": 0.5765333333333333, + "grad_norm": 11.971638679504395, + "kl": 0.25048828125, + "learning_rate": 7.117333333333333e-07, + "loss": 0.01, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4324 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.5766666666666667, + "grad_norm": 5.6313300132751465, + "kl": 0.291015625, + "learning_rate": 7.116666666666666e-07, + "loss": 0.0117, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4325 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.1875, + "epoch": 0.5768, + "grad_norm": 4.001108169555664, + "kl": 0.2216796875, + "learning_rate": 7.116e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4326 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 0.5769333333333333, + "grad_norm": 5.808693885803223, + "kl": 0.25537109375, + "learning_rate": 7.115333333333333e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4327 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1875, + "epoch": 0.5770666666666666, + "grad_norm": 0.7862417101860046, + "kl": 0.31201171875, + "learning_rate": 7.114666666666667e-07, + "loss": 0.0125, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4328 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.875, + "epoch": 0.5772, + "grad_norm": 10.165493965148926, + "kl": 0.28271484375, + "learning_rate": 7.114e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4329 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 0.5773333333333334, + "grad_norm": 5.0397725105285645, + "kl": 0.23876953125, + "learning_rate": 7.113333333333334e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4330 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.25, + "epoch": 0.5774666666666667, + "grad_norm": 8.180802345275879, + "kl": 0.16796875, + "learning_rate": 7.112666666666667e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4331 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.375, + "epoch": 0.5776, + "grad_norm": 4.429012775421143, + "kl": 0.2060546875, + "learning_rate": 7.112000000000001e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4332 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8125, + "epoch": 0.5777333333333333, + "grad_norm": 5.43139123916626, + "kl": 0.318359375, + "learning_rate": 7.111333333333333e-07, + "loss": 0.0128, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4333 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.6875, + "epoch": 0.5778666666666666, + "grad_norm": 9.198626518249512, + "kl": 0.36376953125, + "learning_rate": 7.110666666666665e-07, + "loss": 0.0145, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.75, + "epoch": 0.578, + "grad_norm": 8.200366020202637, + "kl": 0.23974609375, + "learning_rate": 7.11e-07, + "loss": 0.0096, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4335 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.875, + "epoch": 0.5781333333333334, + "grad_norm": 6.803932189941406, + "kl": 0.14990234375, + "learning_rate": 7.109333333333332e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4336 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.1875, + "epoch": 0.5782666666666667, + "grad_norm": 7.139333248138428, + "kl": 0.263671875, + "learning_rate": 7.108666666666666e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4337 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 0.5784, + "grad_norm": 0.38059088587760925, + "kl": 0.18701171875, + "learning_rate": 7.107999999999999e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4338 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.125, + "epoch": 0.5785333333333333, + "grad_norm": 8.129098892211914, + "kl": 0.25341796875, + "learning_rate": 7.107333333333333e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4339 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.25, + "epoch": 0.5786666666666667, + "grad_norm": 59.79313659667969, + "kl": 0.1904296875, + "learning_rate": 7.106666666666666e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4340 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.25, + "epoch": 0.5788, + "grad_norm": 5.163728713989258, + "kl": 0.26806640625, + "learning_rate": 7.106e-07, + "loss": 0.0107, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4341 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6875, + "epoch": 0.5789333333333333, + "grad_norm": 6.44206428527832, + "kl": 0.19091796875, + "learning_rate": 7.105333333333333e-07, + "loss": 0.0076, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4342 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.875, + "epoch": 0.5790666666666666, + "grad_norm": 0.32891589403152466, + "kl": 0.21630859375, + "learning_rate": 7.104666666666666e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4343 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.625, + "epoch": 0.5792, + "grad_norm": 45.71721267700195, + "kl": 0.2333984375, + "learning_rate": 7.104e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4344 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 0.5793333333333334, + "grad_norm": 6.068707466125488, + "kl": 0.162109375, + "learning_rate": 7.103333333333333e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4345 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5625, + "epoch": 0.5794666666666667, + "grad_norm": 11.958856582641602, + "kl": 0.21923828125, + "learning_rate": 7.102666666666667e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4346 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.5796, + "grad_norm": 8.726551055908203, + "kl": 0.18408203125, + "learning_rate": 7.102e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4347 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.1875, + "epoch": 0.5797333333333333, + "grad_norm": 9.798240661621094, + "kl": 0.2197265625, + "learning_rate": 7.101333333333333e-07, + "loss": 0.0088, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4348 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.5798666666666666, + "grad_norm": 7.58344030380249, + "kl": 0.19189453125, + "learning_rate": 7.100666666666666e-07, + "loss": 0.0077, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4349 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0, + "epoch": 0.58, + "grad_norm": 9.489968299865723, + "kl": 0.2763671875, + "learning_rate": 7.1e-07, + "loss": 0.0111, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 4350 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5, + "epoch": 0.5801333333333333, + "grad_norm": 7.586558818817139, + "kl": 0.255859375, + "learning_rate": 7.099333333333333e-07, + "loss": 0.0103, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4351 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 0.5802666666666667, + "grad_norm": 6.4947309494018555, + "kl": 0.1640625, + "learning_rate": 7.098666666666666e-07, + "loss": 0.0066, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4352 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0625, + "epoch": 0.5804, + "grad_norm": 0.325535386800766, + "kl": 0.17578125, + "learning_rate": 7.098e-07, + "loss": 0.007, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4353 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.5805333333333333, + "grad_norm": 5.905966281890869, + "kl": 0.23095703125, + "learning_rate": 7.097333333333333e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4354 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.8125, + "epoch": 0.5806666666666667, + "grad_norm": 7.083034992218018, + "kl": 0.2470703125, + "learning_rate": 7.096666666666667e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4355 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.875, + "epoch": 0.5808, + "grad_norm": 6.519813060760498, + "kl": 0.1591796875, + "learning_rate": 7.096e-07, + "loss": 0.0064, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4356 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.3125, + "epoch": 0.5809333333333333, + "grad_norm": 4.871115207672119, + "kl": 0.14453125, + "learning_rate": 7.095333333333333e-07, + "loss": 0.0058, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4357 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 0.5810666666666666, + "grad_norm": 5.047364711761475, + "kl": 0.2392578125, + "learning_rate": 7.094666666666666e-07, + "loss": 0.0096, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4358 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 0.5812, + "grad_norm": 7.052008628845215, + "kl": 0.22998046875, + "learning_rate": 7.094e-07, + "loss": 0.0092, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4359 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.5813333333333334, + "grad_norm": 5.1892900466918945, + "kl": 0.22265625, + "learning_rate": 7.093333333333333e-07, + "loss": 0.0089, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4360 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.9375, + "epoch": 0.5814666666666667, + "grad_norm": 4.5992021560668945, + "kl": 0.24609375, + "learning_rate": 7.092666666666667e-07, + "loss": 0.0098, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4361 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 0.5816, + "grad_norm": 6.548433780670166, + "kl": 0.18017578125, + "learning_rate": 7.092e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4362 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.3125, + "epoch": 0.5817333333333333, + "grad_norm": 6.337824821472168, + "kl": 0.16357421875, + "learning_rate": 7.091333333333332e-07, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4363 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8125, + "epoch": 0.5818666666666666, + "grad_norm": 9.506858825683594, + "kl": 0.2021484375, + "learning_rate": 7.090666666666666e-07, + "loss": 0.0081, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4364 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.3125, + "epoch": 0.582, + "grad_norm": 4.872460842132568, + "kl": 0.2958984375, + "learning_rate": 7.089999999999999e-07, + "loss": 0.0118, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4365 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5625, + "epoch": 0.5821333333333333, + "grad_norm": 7.908537864685059, + "kl": 0.24755859375, + "learning_rate": 7.089333333333333e-07, + "loss": 0.0099, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4366 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.875, + "epoch": 0.5822666666666667, + "grad_norm": 8.45768928527832, + "kl": 0.18603515625, + "learning_rate": 7.088666666666666e-07, + "loss": 0.0074, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4367 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.5824, + "grad_norm": 5.338008403778076, + "kl": 0.24755859375, + "learning_rate": 7.088e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4368 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.1875, + "epoch": 0.5825333333333333, + "grad_norm": 29.047727584838867, + "kl": 0.2080078125, + "learning_rate": 7.087333333333333e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4369 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0625, + "epoch": 0.5826666666666667, + "grad_norm": 0.4948374032974243, + "kl": 0.244140625, + "learning_rate": 7.086666666666667e-07, + "loss": 0.0098, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4370 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6875, + "epoch": 0.5828, + "grad_norm": 8.085284233093262, + "kl": 0.279296875, + "learning_rate": 7.086e-07, + "loss": 0.0112, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4371 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0, + "epoch": 0.5829333333333333, + "grad_norm": 6.924615859985352, + "kl": 0.2119140625, + "learning_rate": 7.085333333333333e-07, + "loss": 0.0085, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4372 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.4375, + "epoch": 0.5830666666666666, + "grad_norm": 5.388528347015381, + "kl": 0.1552734375, + "learning_rate": 7.084666666666667e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4373 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.4375, + "epoch": 0.5832, + "grad_norm": 7.549252986907959, + "kl": 0.275390625, + "learning_rate": 7.084e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4374 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.875, + "epoch": 0.5833333333333334, + "grad_norm": 5.74502420425415, + "kl": 0.244140625, + "learning_rate": 7.083333333333334e-07, + "loss": 0.0097, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.5834666666666667, + "grad_norm": 9.111069679260254, + "kl": 0.169921875, + "learning_rate": 7.082666666666667e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4376 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.1875, + "epoch": 0.5836, + "grad_norm": 6.132711410522461, + "kl": 0.2666015625, + "learning_rate": 7.082000000000001e-07, + "loss": 0.0107, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4377 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.8125, + "epoch": 0.5837333333333333, + "grad_norm": 0.43934449553489685, + "kl": 0.22314453125, + "learning_rate": 7.081333333333332e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4378 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.0, + "epoch": 0.5838666666666666, + "grad_norm": 0.4188874065876007, + "kl": 0.2890625, + "learning_rate": 7.080666666666666e-07, + "loss": 0.0115, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4379 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.4375, + "epoch": 0.584, + "grad_norm": 0.629271924495697, + "kl": 0.3212890625, + "learning_rate": 7.079999999999999e-07, + "loss": 0.0129, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4380 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.5841333333333333, + "grad_norm": 4.566792011260986, + "kl": 0.163818359375, + "learning_rate": 7.079333333333332e-07, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4381 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.4375, + "epoch": 0.5842666666666667, + "grad_norm": 7.821908950805664, + "kl": 0.26416015625, + "learning_rate": 7.078666666666666e-07, + "loss": 0.0106, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4382 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.9375, + "epoch": 0.5844, + "grad_norm": 8.506772994995117, + "kl": 0.240234375, + "learning_rate": 7.077999999999999e-07, + "loss": 0.0096, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4383 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.25, + "epoch": 0.5845333333333333, + "grad_norm": 7.49269437789917, + "kl": 0.2197265625, + "learning_rate": 7.077333333333333e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4384 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75, + "epoch": 0.5846666666666667, + "grad_norm": 4.5764641761779785, + "kl": 0.15380859375, + "learning_rate": 7.076666666666666e-07, + "loss": 0.0062, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4385 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.625, + "epoch": 0.5848, + "grad_norm": 7.112590789794922, + "kl": 0.3662109375, + "learning_rate": 7.076e-07, + "loss": 0.0147, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4386 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.5849333333333333, + "grad_norm": 6.1346211433410645, + "kl": 0.15771484375, + "learning_rate": 7.075333333333333e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4387 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.25, + "epoch": 0.5850666666666666, + "grad_norm": 5.604811668395996, + "kl": 0.216796875, + "learning_rate": 7.074666666666667e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4388 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3125, + "epoch": 0.5852, + "grad_norm": 6.449929237365723, + "kl": 0.1904296875, + "learning_rate": 7.074e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4389 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.25, + "epoch": 0.5853333333333334, + "grad_norm": 5.6096649169921875, + "kl": 0.1474609375, + "learning_rate": 7.073333333333333e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4390 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.5854666666666667, + "grad_norm": 9.662385940551758, + "kl": 0.19189453125, + "learning_rate": 7.072666666666667e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4391 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.0, + "epoch": 0.5856, + "grad_norm": 4.3683624267578125, + "kl": 0.17041015625, + "learning_rate": 7.072e-07, + "loss": 0.0068, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4392 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.5857333333333333, + "grad_norm": 7.360645771026611, + "kl": 0.2177734375, + "learning_rate": 7.071333333333333e-07, + "loss": 0.0087, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4393 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.5858666666666666, + "grad_norm": 6.987362861633301, + "kl": 0.35986328125, + "learning_rate": 7.070666666666666e-07, + "loss": 0.0144, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 4394 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.8125, + "epoch": 0.586, + "grad_norm": 6.403764247894287, + "kl": 0.2021484375, + "learning_rate": 7.07e-07, + "loss": 0.0081, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4395 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.5861333333333333, + "grad_norm": 5.69639778137207, + "kl": 0.263671875, + "learning_rate": 7.069333333333333e-07, + "loss": 0.0105, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4396 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.375, + "epoch": 0.5862666666666667, + "grad_norm": 5.557440280914307, + "kl": 0.18017578125, + "learning_rate": 7.068666666666667e-07, + "loss": 0.0072, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4397 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9375, + "epoch": 0.5864, + "grad_norm": 6.471135139465332, + "kl": 0.263671875, + "learning_rate": 7.068e-07, + "loss": 0.0106, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4398 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.8125, + "epoch": 0.5865333333333334, + "grad_norm": 5.876548767089844, + "kl": 0.21728515625, + "learning_rate": 7.067333333333332e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4399 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.75, + "epoch": 0.5866666666666667, + "grad_norm": 12.03738021850586, + "kl": 0.2841796875, + "learning_rate": 7.066666666666666e-07, + "loss": 0.0114, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4400 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5625, + "epoch": 0.5868, + "grad_norm": 4.780569553375244, + "kl": 0.18115234375, + "learning_rate": 7.065999999999999e-07, + "loss": 0.0073, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4401 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.125, + "epoch": 0.5869333333333333, + "grad_norm": 4.547854423522949, + "kl": 0.2158203125, + "learning_rate": 7.065333333333333e-07, + "loss": 0.0086, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4402 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.375, + "epoch": 0.5870666666666666, + "grad_norm": 4.207927703857422, + "kl": 0.19970703125, + "learning_rate": 7.064666666666666e-07, + "loss": 0.008, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4403 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 0.5872, + "grad_norm": 6.733548164367676, + "kl": 0.18212890625, + "learning_rate": 7.064e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4404 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.625, + "epoch": 0.5873333333333334, + "grad_norm": 9.301236152648926, + "kl": 0.2314453125, + "learning_rate": 7.063333333333333e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4405 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6875, + "epoch": 0.5874666666666667, + "grad_norm": 5.254326343536377, + "kl": 0.17626953125, + "learning_rate": 7.062666666666667e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4406 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.5876, + "grad_norm": 3.5205318927764893, + "kl": 0.18505859375, + "learning_rate": 7.062e-07, + "loss": 0.0074, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4407 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.375, + "epoch": 0.5877333333333333, + "grad_norm": 5.680309295654297, + "kl": 0.13427734375, + "learning_rate": 7.061333333333332e-07, + "loss": 0.0054, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4408 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.6875, + "epoch": 0.5878666666666666, + "grad_norm": 5.040436744689941, + "kl": 0.1611328125, + "learning_rate": 7.060666666666666e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4409 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.8125, + "epoch": 0.588, + "grad_norm": 4.776744365692139, + "kl": 0.17822265625, + "learning_rate": 7.059999999999999e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4410 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.875, + "epoch": 0.5881333333333333, + "grad_norm": 7.349093437194824, + "kl": 0.19921875, + "learning_rate": 7.059333333333333e-07, + "loss": 0.008, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4411 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.75, + "epoch": 0.5882666666666667, + "grad_norm": 5.5647077560424805, + "kl": 0.137451171875, + "learning_rate": 7.058666666666666e-07, + "loss": 0.0055, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4412 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.3125, + "epoch": 0.5884, + "grad_norm": 3.6471738815307617, + "kl": 0.1572265625, + "learning_rate": 7.058e-07, + "loss": 0.0063, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4413 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5, + "epoch": 0.5885333333333334, + "grad_norm": 5.526362419128418, + "kl": 0.20947265625, + "learning_rate": 7.057333333333333e-07, + "loss": 0.0084, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4414 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.625, + "epoch": 0.5886666666666667, + "grad_norm": 6.65897274017334, + "kl": 0.29541015625, + "learning_rate": 7.056666666666667e-07, + "loss": 0.0118, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4415 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.375, + "epoch": 0.5888, + "grad_norm": 6.715798377990723, + "kl": 0.18994140625, + "learning_rate": 7.056e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4416 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5, + "epoch": 0.5889333333333333, + "grad_norm": 6.320472240447998, + "kl": 0.14697265625, + "learning_rate": 7.055333333333334e-07, + "loss": 0.0059, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4417 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.1875, + "epoch": 0.5890666666666666, + "grad_norm": 11.487939834594727, + "kl": 0.28515625, + "learning_rate": 7.054666666666667e-07, + "loss": 0.0114, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 4418 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.5892, + "grad_norm": 8.306568145751953, + "kl": 0.22412109375, + "learning_rate": 7.054e-07, + "loss": 0.009, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4419 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.5893333333333334, + "grad_norm": 29.299409866333008, + "kl": 0.1953125, + "learning_rate": 7.053333333333333e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4420 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.625, + "epoch": 0.5894666666666667, + "grad_norm": 7.254203796386719, + "kl": 0.158203125, + "learning_rate": 7.052666666666666e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4421 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.25, + "epoch": 0.5896, + "grad_norm": 7.117905616760254, + "kl": 0.13330078125, + "learning_rate": 7.052e-07, + "loss": 0.0053, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4422 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 0.5897333333333333, + "grad_norm": 7.1562957763671875, + "kl": 0.16552734375, + "learning_rate": 7.051333333333332e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4423 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.5898666666666667, + "grad_norm": 0.4599822163581848, + "kl": 0.2333984375, + "learning_rate": 7.050666666666666e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4424 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.59, + "grad_norm": 13.832315444946289, + "kl": 0.34716796875, + "learning_rate": 7.049999999999999e-07, + "loss": 0.0139, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4425 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.5901333333333333, + "grad_norm": 12.827072143554688, + "kl": 0.31494140625, + "learning_rate": 7.049333333333333e-07, + "loss": 0.0126, + "reward": 1.4375, + "reward_std": 0.6739883720874786, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4426 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.625, + "epoch": 0.5902666666666667, + "grad_norm": 6.2504072189331055, + "kl": 0.294921875, + "learning_rate": 7.048666666666666e-07, + "loss": 0.0118, + "reward": 1.5625, + "reward_std": 0.4172614812850952, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4427 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0625, + "epoch": 0.5904, + "grad_norm": 6.763465881347656, + "kl": 0.177734375, + "learning_rate": 7.047999999999999e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4428 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.375, + "epoch": 0.5905333333333334, + "grad_norm": 6.6785759925842285, + "kl": 0.1787109375, + "learning_rate": 7.047333333333333e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4429 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.5906666666666667, + "grad_norm": 6.780340671539307, + "kl": 0.2783203125, + "learning_rate": 7.046666666666666e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4430 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.5908, + "grad_norm": 6.654054641723633, + "kl": 0.234375, + "learning_rate": 7.046e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4431 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5, + "epoch": 0.5909333333333333, + "grad_norm": 6.945515155792236, + "kl": 0.16015625, + "learning_rate": 7.045333333333333e-07, + "loss": 0.0064, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4432 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0625, + "epoch": 0.5910666666666666, + "grad_norm": 5.999631881713867, + "kl": 0.1650390625, + "learning_rate": 7.044666666666667e-07, + "loss": 0.0066, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4433 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8125, + "epoch": 0.5912, + "grad_norm": 7.282989025115967, + "kl": 0.203125, + "learning_rate": 7.044e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4434 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.5913333333333334, + "grad_norm": 14.333822250366211, + "kl": 0.228515625, + "learning_rate": 7.043333333333334e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4435 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.75, + "epoch": 0.5914666666666667, + "grad_norm": 8.196714401245117, + "kl": 0.17724609375, + "learning_rate": 7.042666666666667e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4436 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1875, + "epoch": 0.5916, + "grad_norm": 7.6536383628845215, + "kl": 0.14453125, + "learning_rate": 7.042e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4437 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.5917333333333333, + "grad_norm": 31.23998260498047, + "kl": 0.22802734375, + "learning_rate": 7.041333333333334e-07, + "loss": 0.0091, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4438 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.75, + "epoch": 0.5918666666666667, + "grad_norm": 2.5725467205047607, + "kl": 0.17041015625, + "learning_rate": 7.040666666666666e-07, + "loss": 0.0068, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4439 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9375, + "epoch": 0.592, + "grad_norm": 7.445695400238037, + "kl": 0.14453125, + "learning_rate": 7.04e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4440 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0, + "epoch": 0.5921333333333333, + "grad_norm": 7.5326008796691895, + "kl": 0.2138671875, + "learning_rate": 7.039333333333332e-07, + "loss": 0.0086, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4441 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5, + "epoch": 0.5922666666666667, + "grad_norm": 124.47219848632812, + "kl": 0.23291015625, + "learning_rate": 7.038666666666666e-07, + "loss": 0.0093, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4442 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5, + "epoch": 0.5924, + "grad_norm": 10.09389877319336, + "kl": 0.35888671875, + "learning_rate": 7.037999999999999e-07, + "loss": 0.0144, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 4443 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.75, + "epoch": 0.5925333333333334, + "grad_norm": 5.017742156982422, + "kl": 0.122314453125, + "learning_rate": 7.037333333333333e-07, + "loss": 0.0049, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4444 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.6875, + "epoch": 0.5926666666666667, + "grad_norm": 8.411994934082031, + "kl": 0.3466796875, + "learning_rate": 7.036666666666666e-07, + "loss": 0.0138, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4445 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.125, + "epoch": 0.5928, + "grad_norm": 6.661052703857422, + "kl": 0.15087890625, + "learning_rate": 7.035999999999999e-07, + "loss": 0.006, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4446 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.5929333333333333, + "grad_norm": 3.9798059463500977, + "kl": 0.17431640625, + "learning_rate": 7.035333333333333e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4447 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.6875, + "epoch": 0.5930666666666666, + "grad_norm": 7.04092264175415, + "kl": 0.18896484375, + "learning_rate": 7.034666666666666e-07, + "loss": 0.0076, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4448 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.875, + "epoch": 0.5932, + "grad_norm": 4.223150253295898, + "kl": 0.123291015625, + "learning_rate": 7.034e-07, + "loss": 0.0049, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4449 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.5933333333333334, + "grad_norm": 4.187964916229248, + "kl": 0.177734375, + "learning_rate": 7.033333333333333e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4450 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5, + "epoch": 0.5934666666666667, + "grad_norm": 7.058931350708008, + "kl": 0.14990234375, + "learning_rate": 7.032666666666667e-07, + "loss": 0.006, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4451 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.125, + "epoch": 0.5936, + "grad_norm": 7.406148910522461, + "kl": 0.23583984375, + "learning_rate": 7.032e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4452 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0, + "epoch": 0.5937333333333333, + "grad_norm": 4.374039173126221, + "kl": 0.24072265625, + "learning_rate": 7.031333333333334e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4453 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.25, + "epoch": 0.5938666666666667, + "grad_norm": 4.781127452850342, + "kl": 0.171875, + "learning_rate": 7.030666666666666e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4454 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.25, + "epoch": 0.594, + "grad_norm": 5.923183441162109, + "kl": 0.2568359375, + "learning_rate": 7.029999999999999e-07, + "loss": 0.0103, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4455 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.125, + "epoch": 0.5941333333333333, + "grad_norm": 7.039451599121094, + "kl": 0.142333984375, + "learning_rate": 7.029333333333333e-07, + "loss": 0.0057, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4456 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 0.5942666666666667, + "grad_norm": 4.045903205871582, + "kl": 0.18701171875, + "learning_rate": 7.028666666666666e-07, + "loss": 0.0075, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4457 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6875, + "epoch": 0.5944, + "grad_norm": 6.373176097869873, + "kl": 0.177734375, + "learning_rate": 7.028e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4458 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0, + "epoch": 0.5945333333333334, + "grad_norm": 10.359851837158203, + "kl": 0.21728515625, + "learning_rate": 7.027333333333333e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4459 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1875, + "epoch": 0.5946666666666667, + "grad_norm": 4.371721267700195, + "kl": 0.1767578125, + "learning_rate": 7.026666666666667e-07, + "loss": 0.0071, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4460 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.375, + "epoch": 0.5948, + "grad_norm": 4.185277938842773, + "kl": 0.14794921875, + "learning_rate": 7.026e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4461 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.5949333333333333, + "grad_norm": 8.26067066192627, + "kl": 0.26025390625, + "learning_rate": 7.025333333333334e-07, + "loss": 0.0104, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4462 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.6875, + "epoch": 0.5950666666666666, + "grad_norm": 7.63811731338501, + "kl": 0.22119140625, + "learning_rate": 7.024666666666666e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4463 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.25, + "epoch": 0.5952, + "grad_norm": 8.139541625976562, + "kl": 0.2080078125, + "learning_rate": 7.024e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.6760360598564148, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4464 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.75, + "epoch": 0.5953333333333334, + "grad_norm": 8.233163833618164, + "kl": 0.17626953125, + "learning_rate": 7.023333333333333e-07, + "loss": 0.0071, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4465 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.875, + "epoch": 0.5954666666666667, + "grad_norm": 5.9869208335876465, + "kl": 0.1513671875, + "learning_rate": 7.022666666666666e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4466 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 0.5956, + "grad_norm": 4.343156337738037, + "kl": 0.26708984375, + "learning_rate": 7.022e-07, + "loss": 0.0107, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4467 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.1875, + "epoch": 0.5957333333333333, + "grad_norm": 9.066656112670898, + "kl": 0.3154296875, + "learning_rate": 7.021333333333333e-07, + "loss": 0.0126, + "reward": 1.3125, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4468 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.25, + "epoch": 0.5958666666666667, + "grad_norm": 5.673039436340332, + "kl": 0.189453125, + "learning_rate": 7.020666666666666e-07, + "loss": 0.0076, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4469 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.0, + "epoch": 0.596, + "grad_norm": 7.229302883148193, + "kl": 0.2939453125, + "learning_rate": 7.019999999999999e-07, + "loss": 0.0118, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4470 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.5961333333333333, + "grad_norm": 7.429739952087402, + "kl": 0.173828125, + "learning_rate": 7.019333333333333e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4471 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 0.5962666666666666, + "grad_norm": 3.914323091506958, + "kl": 0.138671875, + "learning_rate": 7.018666666666666e-07, + "loss": 0.0056, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4472 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75, + "epoch": 0.5964, + "grad_norm": 5.611045837402344, + "kl": 0.20263671875, + "learning_rate": 7.018e-07, + "loss": 0.0081, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4473 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 0.5965333333333334, + "grad_norm": 7.418447017669678, + "kl": 0.2705078125, + "learning_rate": 7.017333333333333e-07, + "loss": 0.0108, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4474 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.75, + "epoch": 0.5966666666666667, + "grad_norm": 8.203641891479492, + "kl": 0.18115234375, + "learning_rate": 7.016666666666666e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4475 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8125, + "epoch": 0.5968, + "grad_norm": 7.028825759887695, + "kl": 0.18505859375, + "learning_rate": 7.016e-07, + "loss": 0.0074, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4476 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.875, + "epoch": 0.5969333333333333, + "grad_norm": 7.498913288116455, + "kl": 0.236328125, + "learning_rate": 7.015333333333333e-07, + "loss": 0.0094, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4477 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.5970666666666666, + "grad_norm": 4.847031593322754, + "kl": 0.19873046875, + "learning_rate": 7.014666666666667e-07, + "loss": 0.0079, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4478 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.75, + "epoch": 0.5972, + "grad_norm": 10.253063201904297, + "kl": 0.3271484375, + "learning_rate": 7.014e-07, + "loss": 0.0131, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4479 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 0.5973333333333334, + "grad_norm": 11.24055290222168, + "kl": 0.34912109375, + "learning_rate": 7.013333333333334e-07, + "loss": 0.0139, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4480 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.625, + "epoch": 0.5974666666666667, + "grad_norm": 4.647652626037598, + "kl": 0.122802734375, + "learning_rate": 7.012666666666667e-07, + "loss": 0.0049, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4481 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6875, + "epoch": 0.5976, + "grad_norm": 0.336357980966568, + "kl": 0.21484375, + "learning_rate": 7.012000000000001e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4482 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9375, + "epoch": 0.5977333333333333, + "grad_norm": 5.9411211013793945, + "kl": 0.16455078125, + "learning_rate": 7.011333333333334e-07, + "loss": 0.0066, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4483 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.375, + "epoch": 0.5978666666666667, + "grad_norm": 34.109153747558594, + "kl": 0.25146484375, + "learning_rate": 7.010666666666665e-07, + "loss": 0.01, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4484 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.598, + "grad_norm": 6.859610557556152, + "kl": 0.19970703125, + "learning_rate": 7.009999999999999e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4485 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8125, + "epoch": 0.5981333333333333, + "grad_norm": 9.25366497039795, + "kl": 0.25439453125, + "learning_rate": 7.009333333333332e-07, + "loss": 0.0102, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4486 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.4375, + "epoch": 0.5982666666666666, + "grad_norm": 4.697730541229248, + "kl": 0.26220703125, + "learning_rate": 7.008666666666666e-07, + "loss": 0.0105, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4487 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 0.5984, + "grad_norm": 7.905256271362305, + "kl": 0.16796875, + "learning_rate": 7.007999999999999e-07, + "loss": 0.0067, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4488 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.125, + "epoch": 0.5985333333333334, + "grad_norm": 7.510989665985107, + "kl": 0.1953125, + "learning_rate": 7.007333333333333e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4489 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 0.5986666666666667, + "grad_norm": 6.104791164398193, + "kl": 0.2109375, + "learning_rate": 7.006666666666666e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4490 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.625, + "epoch": 0.5988, + "grad_norm": 8.327006340026855, + "kl": 0.22412109375, + "learning_rate": 7.006e-07, + "loss": 0.009, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4491 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6875, + "epoch": 0.5989333333333333, + "grad_norm": 5.268330097198486, + "kl": 0.1650390625, + "learning_rate": 7.005333333333333e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4492 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.5990666666666666, + "grad_norm": 7.0482940673828125, + "kl": 0.16748046875, + "learning_rate": 7.004666666666666e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4493 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.1875, + "epoch": 0.5992, + "grad_norm": 0.3311052918434143, + "kl": 0.21240234375, + "learning_rate": 7.004e-07, + "loss": 0.0085, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4494 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9375, + "epoch": 0.5993333333333334, + "grad_norm": 7.514556884765625, + "kl": 0.13330078125, + "learning_rate": 7.003333333333333e-07, + "loss": 0.0053, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4495 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4375, + "epoch": 0.5994666666666667, + "grad_norm": 5.366645812988281, + "kl": 0.16845703125, + "learning_rate": 7.002666666666667e-07, + "loss": 0.0067, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4496 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8125, + "epoch": 0.5996, + "grad_norm": 8.390454292297363, + "kl": 0.16943359375, + "learning_rate": 7.002e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4497 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.5997333333333333, + "grad_norm": 9.221834182739258, + "kl": 0.19775390625, + "learning_rate": 7.001333333333334e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4498 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.5998666666666667, + "grad_norm": 7.50528621673584, + "kl": 0.13134765625, + "learning_rate": 7.000666666666666e-07, + "loss": 0.0052, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4499 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.3125, + "epoch": 0.6, + "grad_norm": 8.911046981811523, + "kl": 0.193115234375, + "learning_rate": 7e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 4500 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5625, + "epoch": 0.6001333333333333, + "grad_norm": 7.313797950744629, + "kl": 0.18603515625, + "learning_rate": 6.999333333333333e-07, + "loss": 0.0074, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4501 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.6002666666666666, + "grad_norm": 7.916093349456787, + "kl": 0.140869140625, + "learning_rate": 6.998666666666666e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4502 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.6004, + "grad_norm": 5.986486434936523, + "kl": 0.1669921875, + "learning_rate": 6.998e-07, + "loss": 0.0067, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4503 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5625, + "epoch": 0.6005333333333334, + "grad_norm": 3.90671706199646, + "kl": 0.154296875, + "learning_rate": 6.997333333333332e-07, + "loss": 0.0062, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4504 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.4375, + "epoch": 0.6006666666666667, + "grad_norm": 4.777129173278809, + "kl": 0.172607421875, + "learning_rate": 6.996666666666666e-07, + "loss": 0.0069, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4505 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.625, + "epoch": 0.6008, + "grad_norm": 4.710752964019775, + "kl": 0.24169921875, + "learning_rate": 6.995999999999999e-07, + "loss": 0.0097, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4506 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.875, + "epoch": 0.6009333333333333, + "grad_norm": 6.371540069580078, + "kl": 0.19189453125, + "learning_rate": 6.995333333333333e-07, + "loss": 0.0077, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4507 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.6875, + "epoch": 0.6010666666666666, + "grad_norm": 91.13725280761719, + "kl": 0.18115234375, + "learning_rate": 6.994666666666666e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4508 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.0625, + "epoch": 0.6012, + "grad_norm": 5.048213481903076, + "kl": 0.3310546875, + "learning_rate": 6.994e-07, + "loss": 0.0133, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4509 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.6013333333333334, + "grad_norm": 9.304121017456055, + "kl": 0.27734375, + "learning_rate": 6.993333333333333e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4510 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 0.6014666666666667, + "grad_norm": 7.50678014755249, + "kl": 0.19775390625, + "learning_rate": 6.992666666666667e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4511 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.75, + "epoch": 0.6016, + "grad_norm": 7.204689025878906, + "kl": 0.27685546875, + "learning_rate": 6.992e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4512 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.6017333333333333, + "grad_norm": 7.3390421867370605, + "kl": 0.302734375, + "learning_rate": 6.991333333333333e-07, + "loss": 0.0121, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4513 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.125, + "epoch": 0.6018666666666667, + "grad_norm": 8.045222282409668, + "kl": 0.296875, + "learning_rate": 6.990666666666666e-07, + "loss": 0.0119, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4514 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0625, + "epoch": 0.602, + "grad_norm": 7.3331217765808105, + "kl": 0.13427734375, + "learning_rate": 6.989999999999999e-07, + "loss": 0.0054, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4515 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.625, + "epoch": 0.6021333333333333, + "grad_norm": 7.820693016052246, + "kl": 0.134765625, + "learning_rate": 6.989333333333333e-07, + "loss": 0.0054, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4516 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.6022666666666666, + "grad_norm": 5.227692604064941, + "kl": 0.1611328125, + "learning_rate": 6.988666666666666e-07, + "loss": 0.0064, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4517 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0625, + "epoch": 0.6024, + "grad_norm": 6.0008158683776855, + "kl": 0.16064453125, + "learning_rate": 6.988e-07, + "loss": 0.0064, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4518 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.6025333333333334, + "grad_norm": 5.469173431396484, + "kl": 0.22314453125, + "learning_rate": 6.987333333333333e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4519 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 0.6026666666666667, + "grad_norm": 11.897677421569824, + "kl": 0.1484375, + "learning_rate": 6.986666666666667e-07, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4520 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6875, + "epoch": 0.6028, + "grad_norm": 7.173709392547607, + "kl": 0.169921875, + "learning_rate": 6.986e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4521 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.125, + "epoch": 0.6029333333333333, + "grad_norm": 9.731943130493164, + "kl": 0.19189453125, + "learning_rate": 6.985333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4522 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.625, + "epoch": 0.6030666666666666, + "grad_norm": 12.979230880737305, + "kl": 0.7294921875, + "learning_rate": 6.984666666666667e-07, + "loss": 0.0292, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4523 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5, + "epoch": 0.6032, + "grad_norm": 7.348535060882568, + "kl": 0.16357421875, + "learning_rate": 6.984e-07, + "loss": 0.0065, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4524 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.375, + "epoch": 0.6033333333333334, + "grad_norm": 5.904820919036865, + "kl": 0.13232421875, + "learning_rate": 6.983333333333334e-07, + "loss": 0.0053, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4525 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4375, + "epoch": 0.6034666666666667, + "grad_norm": 7.40321683883667, + "kl": 0.13623046875, + "learning_rate": 6.982666666666666e-07, + "loss": 0.0055, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4526 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.375, + "epoch": 0.6036, + "grad_norm": 6.30137300491333, + "kl": 0.122802734375, + "learning_rate": 6.982e-07, + "loss": 0.0049, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4527 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9375, + "epoch": 0.6037333333333333, + "grad_norm": 5.660009384155273, + "kl": 0.151123046875, + "learning_rate": 6.981333333333333e-07, + "loss": 0.006, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4528 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.4375, + "epoch": 0.6038666666666667, + "grad_norm": 6.141552925109863, + "kl": 0.2705078125, + "learning_rate": 6.980666666666666e-07, + "loss": 0.0108, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4529 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.604, + "grad_norm": 4.731907844543457, + "kl": 0.173828125, + "learning_rate": 6.979999999999999e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4530 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 0.6041333333333333, + "grad_norm": 0.3787926435470581, + "kl": 0.18212890625, + "learning_rate": 6.979333333333332e-07, + "loss": 0.0073, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4531 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.6875, + "epoch": 0.6042666666666666, + "grad_norm": 4.791690349578857, + "kl": 0.20166015625, + "learning_rate": 6.978666666666666e-07, + "loss": 0.0081, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4532 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.6044, + "grad_norm": 7.602738857269287, + "kl": 0.18701171875, + "learning_rate": 6.977999999999999e-07, + "loss": 0.0075, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4533 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.4375, + "epoch": 0.6045333333333334, + "grad_norm": 7.367252349853516, + "kl": 0.1962890625, + "learning_rate": 6.977333333333333e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4534 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.6046666666666667, + "grad_norm": 7.934572696685791, + "kl": 0.148681640625, + "learning_rate": 6.976666666666666e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4535 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.25, + "epoch": 0.6048, + "grad_norm": 6.538253307342529, + "kl": 0.18310546875, + "learning_rate": 6.976e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4536 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 0.6049333333333333, + "grad_norm": 6.9113593101501465, + "kl": 0.141357421875, + "learning_rate": 6.975333333333333e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4537 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.3125, + "epoch": 0.6050666666666666, + "grad_norm": 3.1726491451263428, + "kl": 0.12548828125, + "learning_rate": 6.974666666666667e-07, + "loss": 0.005, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4538 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0, + "epoch": 0.6052, + "grad_norm": 9.607725143432617, + "kl": 0.18896484375, + "learning_rate": 6.974e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4539 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.6053333333333333, + "grad_norm": 6.534381866455078, + "kl": 0.119140625, + "learning_rate": 6.973333333333333e-07, + "loss": 0.0048, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4540 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1875, + "epoch": 0.6054666666666667, + "grad_norm": 8.531315803527832, + "kl": 0.2099609375, + "learning_rate": 6.972666666666667e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4541 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.75, + "epoch": 0.6056, + "grad_norm": 9.288471221923828, + "kl": 0.30859375, + "learning_rate": 6.972e-07, + "loss": 0.0123, + "reward": 1.3125, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4542 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.4375, + "epoch": 0.6057333333333333, + "grad_norm": 83.45013427734375, + "kl": 0.26025390625, + "learning_rate": 6.971333333333334e-07, + "loss": 0.0104, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4543 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 0.6058666666666667, + "grad_norm": 21.219072341918945, + "kl": 0.14404296875, + "learning_rate": 6.970666666666666e-07, + "loss": 0.0058, + "reward": 1.5, + "reward_std": 0.8300746083259583, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8125, + "step": 4544 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.606, + "grad_norm": 7.948986053466797, + "kl": 0.1669921875, + "learning_rate": 6.97e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4545 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.4375, + "epoch": 0.6061333333333333, + "grad_norm": 4.062943935394287, + "kl": 0.1240234375, + "learning_rate": 6.969333333333332e-07, + "loss": 0.005, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4546 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 0.6062666666666666, + "grad_norm": 6.315509796142578, + "kl": 0.16943359375, + "learning_rate": 6.968666666666666e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4547 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.3125, + "epoch": 0.6064, + "grad_norm": 11.17106819152832, + "kl": 0.14453125, + "learning_rate": 6.967999999999999e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4548 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.875, + "epoch": 0.6065333333333334, + "grad_norm": 7.025022029876709, + "kl": 0.19970703125, + "learning_rate": 6.967333333333332e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4549 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0625, + "epoch": 0.6066666666666667, + "grad_norm": 7.61632776260376, + "kl": 0.1484375, + "learning_rate": 6.966666666666666e-07, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4550 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.6068, + "grad_norm": 6.915498733520508, + "kl": 0.29345703125, + "learning_rate": 6.965999999999999e-07, + "loss": 0.0118, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4551 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 0.6069333333333333, + "grad_norm": 5.070740699768066, + "kl": 0.15380859375, + "learning_rate": 6.965333333333333e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4552 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.875, + "epoch": 0.6070666666666666, + "grad_norm": 8.040190696716309, + "kl": 0.118896484375, + "learning_rate": 6.964666666666666e-07, + "loss": 0.0048, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4553 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.3125, + "epoch": 0.6072, + "grad_norm": 6.885378360748291, + "kl": 0.158447265625, + "learning_rate": 6.964e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4554 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.125, + "epoch": 0.6073333333333333, + "grad_norm": 5.8832902908325195, + "kl": 0.14013671875, + "learning_rate": 6.963333333333333e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4555 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3125, + "epoch": 0.6074666666666667, + "grad_norm": 8.87314510345459, + "kl": 0.1650390625, + "learning_rate": 6.962666666666667e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4556 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.9375, + "epoch": 0.6076, + "grad_norm": 7.640439510345459, + "kl": 0.21826171875, + "learning_rate": 6.962e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4557 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.875, + "epoch": 0.6077333333333333, + "grad_norm": 4.442895412445068, + "kl": 0.1455078125, + "learning_rate": 6.961333333333334e-07, + "loss": 0.0058, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4558 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.6078666666666667, + "grad_norm": 0.3217470347881317, + "kl": 0.16259765625, + "learning_rate": 6.960666666666666e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4559 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5625, + "epoch": 0.608, + "grad_norm": 3.8591363430023193, + "kl": 0.14208984375, + "learning_rate": 6.959999999999999e-07, + "loss": 0.0057, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4560 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0625, + "epoch": 0.6081333333333333, + "grad_norm": 2.9813408851623535, + "kl": 0.1083984375, + "learning_rate": 6.959333333333333e-07, + "loss": 0.0043, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4561 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.1875, + "epoch": 0.6082666666666666, + "grad_norm": 7.200015544891357, + "kl": 0.224609375, + "learning_rate": 6.958666666666666e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4562 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.6084, + "grad_norm": 5.230540752410889, + "kl": 0.23193359375, + "learning_rate": 6.958e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4563 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5, + "epoch": 0.6085333333333334, + "grad_norm": 7.29752254486084, + "kl": 0.193359375, + "learning_rate": 6.957333333333333e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4564 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5, + "epoch": 0.6086666666666667, + "grad_norm": 10.817329406738281, + "kl": 0.2998046875, + "learning_rate": 6.956666666666667e-07, + "loss": 0.012, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 4565 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1875, + "epoch": 0.6088, + "grad_norm": 4.850337505340576, + "kl": 0.137939453125, + "learning_rate": 6.956e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4566 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.6089333333333333, + "grad_norm": 4.253921985626221, + "kl": 0.17578125, + "learning_rate": 6.955333333333334e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4567 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 0.6090666666666666, + "grad_norm": 6.800792694091797, + "kl": 0.1611328125, + "learning_rate": 6.954666666666666e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4568 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.6092, + "grad_norm": 0.28911292552948, + "kl": 0.16455078125, + "learning_rate": 6.953999999999999e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4569 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.6093333333333333, + "grad_norm": 8.169244766235352, + "kl": 0.205078125, + "learning_rate": 6.953333333333333e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4570 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8125, + "epoch": 0.6094666666666667, + "grad_norm": 8.084566116333008, + "kl": 0.1962890625, + "learning_rate": 6.952666666666666e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4571 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5, + "epoch": 0.6096, + "grad_norm": 6.634510517120361, + "kl": 0.151123046875, + "learning_rate": 6.952e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4572 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.6097333333333333, + "grad_norm": 8.360814094543457, + "kl": 0.21533203125, + "learning_rate": 6.951333333333333e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4573 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75, + "epoch": 0.6098666666666667, + "grad_norm": 5.227766513824463, + "kl": 0.140625, + "learning_rate": 6.950666666666667e-07, + "loss": 0.0056, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4574 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0, + "epoch": 0.61, + "grad_norm": 5.875813961029053, + "kl": 0.3017578125, + "learning_rate": 6.949999999999999e-07, + "loss": 0.0121, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8125, + "step": 4575 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.75, + "epoch": 0.6101333333333333, + "grad_norm": 9.122875213623047, + "kl": 0.23486328125, + "learning_rate": 6.949333333333333e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4576 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5, + "epoch": 0.6102666666666666, + "grad_norm": 7.7745280265808105, + "kl": 0.2333984375, + "learning_rate": 6.948666666666666e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4577 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.125, + "epoch": 0.6104, + "grad_norm": 6.861470699310303, + "kl": 0.2099609375, + "learning_rate": 6.947999999999999e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4578 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.6105333333333334, + "grad_norm": 7.195506572723389, + "kl": 0.17138671875, + "learning_rate": 6.947333333333333e-07, + "loss": 0.0068, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4579 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 0.6106666666666667, + "grad_norm": 265.02783203125, + "kl": 0.15283203125, + "learning_rate": 6.946666666666666e-07, + "loss": 0.0061, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4580 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.6875, + "epoch": 0.6108, + "grad_norm": 9.229700088500977, + "kl": 0.22021484375, + "learning_rate": 6.946e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4581 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.75, + "epoch": 0.6109333333333333, + "grad_norm": 4.168158531188965, + "kl": 0.1689453125, + "learning_rate": 6.945333333333333e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4582 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.5, + "epoch": 0.6110666666666666, + "grad_norm": 4.901018142700195, + "kl": 0.298828125, + "learning_rate": 6.944666666666667e-07, + "loss": 0.0119, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4583 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.875, + "epoch": 0.6112, + "grad_norm": 0.5387855768203735, + "kl": 0.21337890625, + "learning_rate": 6.944e-07, + "loss": 0.0085, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 4584 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.875, + "epoch": 0.6113333333333333, + "grad_norm": 6.921317100524902, + "kl": 0.145751953125, + "learning_rate": 6.943333333333334e-07, + "loss": 0.0058, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4585 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1875, + "epoch": 0.6114666666666667, + "grad_norm": 4.525078296661377, + "kl": 0.134033203125, + "learning_rate": 6.942666666666667e-07, + "loss": 0.0054, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4586 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.8125, + "epoch": 0.6116, + "grad_norm": 33.76683807373047, + "kl": 0.1826171875, + "learning_rate": 6.942e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4587 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 0.6117333333333334, + "grad_norm": 8.037758827209473, + "kl": 0.26611328125, + "learning_rate": 6.941333333333334e-07, + "loss": 0.0107, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4588 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5, + "epoch": 0.6118666666666667, + "grad_norm": 5.7257795333862305, + "kl": 0.3759765625, + "learning_rate": 6.940666666666666e-07, + "loss": 0.015, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4589 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 0.612, + "grad_norm": 5.202568531036377, + "kl": 0.16748046875, + "learning_rate": 6.939999999999999e-07, + "loss": 0.0067, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 4590 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.6875, + "epoch": 0.6121333333333333, + "grad_norm": 5.094775199890137, + "kl": 0.1796875, + "learning_rate": 6.939333333333332e-07, + "loss": 0.0072, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4591 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.375, + "epoch": 0.6122666666666666, + "grad_norm": 11.08853530883789, + "kl": 0.23095703125, + "learning_rate": 6.938666666666666e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4592 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.1875, + "epoch": 0.6124, + "grad_norm": 4.941312313079834, + "kl": 0.21484375, + "learning_rate": 6.937999999999999e-07, + "loss": 0.0086, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4593 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0625, + "epoch": 0.6125333333333334, + "grad_norm": 9.912696838378906, + "kl": 0.22998046875, + "learning_rate": 6.937333333333333e-07, + "loss": 0.0092, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4594 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.1875, + "epoch": 0.6126666666666667, + "grad_norm": 7.559901714324951, + "kl": 0.16162109375, + "learning_rate": 6.936666666666666e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4595 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.9375, + "epoch": 0.6128, + "grad_norm": 4.640647888183594, + "kl": 0.16552734375, + "learning_rate": 6.935999999999999e-07, + "loss": 0.0066, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4596 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 0.6129333333333333, + "grad_norm": 6.02608060836792, + "kl": 0.13818359375, + "learning_rate": 6.935333333333333e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4597 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.8125, + "epoch": 0.6130666666666666, + "grad_norm": 6.346468448638916, + "kl": 0.21728515625, + "learning_rate": 6.934666666666666e-07, + "loss": 0.0087, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4598 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.4375, + "epoch": 0.6132, + "grad_norm": 7.774438858032227, + "kl": 0.2373046875, + "learning_rate": 6.934e-07, + "loss": 0.0095, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4599 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 0.6133333333333333, + "grad_norm": 6.900032043457031, + "kl": 0.1904296875, + "learning_rate": 6.933333333333333e-07, + "loss": 0.0076, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4600 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.375, + "epoch": 0.6134666666666667, + "grad_norm": 6.818897247314453, + "kl": 0.21435546875, + "learning_rate": 6.932666666666667e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4601 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.6136, + "grad_norm": 69.1631851196289, + "kl": 0.2646484375, + "learning_rate": 6.932e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4602 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5625, + "epoch": 0.6137333333333334, + "grad_norm": 5.156216144561768, + "kl": 0.1923828125, + "learning_rate": 6.931333333333334e-07, + "loss": 0.0077, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4603 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.875, + "epoch": 0.6138666666666667, + "grad_norm": 7.84705114364624, + "kl": 0.197265625, + "learning_rate": 6.930666666666667e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4604 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.1875, + "epoch": 0.614, + "grad_norm": 0.36549195647239685, + "kl": 0.21728515625, + "learning_rate": 6.929999999999999e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4605 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.6141333333333333, + "grad_norm": 7.644419193267822, + "kl": 0.28955078125, + "learning_rate": 6.929333333333333e-07, + "loss": 0.0116, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4606 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.1875, + "epoch": 0.6142666666666666, + "grad_norm": 7.893495559692383, + "kl": 0.1484375, + "learning_rate": 6.928666666666666e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4607 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.625, + "epoch": 0.6144, + "grad_norm": 31.28521728515625, + "kl": 0.26904296875, + "learning_rate": 6.928e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4608 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5625, + "epoch": 0.6145333333333334, + "grad_norm": 7.186730861663818, + "kl": 0.2109375, + "learning_rate": 6.927333333333332e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4609 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.625, + "epoch": 0.6146666666666667, + "grad_norm": 2.248054027557373, + "kl": 0.2119140625, + "learning_rate": 6.926666666666666e-07, + "loss": 0.0085, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 4610 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5625, + "epoch": 0.6148, + "grad_norm": 10.420830726623535, + "kl": 0.2939453125, + "learning_rate": 6.925999999999999e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4611 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.1875, + "epoch": 0.6149333333333333, + "grad_norm": 0.41278794407844543, + "kl": 0.26708984375, + "learning_rate": 6.925333333333333e-07, + "loss": 0.0107, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4612 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5, + "epoch": 0.6150666666666667, + "grad_norm": 1.5647664070129395, + "kl": 0.3095703125, + "learning_rate": 6.924666666666666e-07, + "loss": 0.0123, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4613 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0, + "epoch": 0.6152, + "grad_norm": 8.243640899658203, + "kl": 0.1826171875, + "learning_rate": 6.924e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4614 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 0.6153333333333333, + "grad_norm": 7.590167045593262, + "kl": 0.166015625, + "learning_rate": 6.923333333333333e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4615 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.6154666666666667, + "grad_norm": 8.344541549682617, + "kl": 0.17041015625, + "learning_rate": 6.922666666666666e-07, + "loss": 0.0068, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4616 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 0.6156, + "grad_norm": 9.990907669067383, + "kl": 0.3037109375, + "learning_rate": 6.922e-07, + "loss": 0.0121, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4617 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.125, + "epoch": 0.6157333333333334, + "grad_norm": 5.904227256774902, + "kl": 0.1982421875, + "learning_rate": 6.921333333333333e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4618 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.25, + "epoch": 0.6158666666666667, + "grad_norm": 6.39783239364624, + "kl": 0.1279296875, + "learning_rate": 6.920666666666667e-07, + "loss": 0.0051, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4619 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.125, + "epoch": 0.616, + "grad_norm": 9.324182510375977, + "kl": 0.27978515625, + "learning_rate": 6.919999999999999e-07, + "loss": 0.0112, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4620 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.125, + "epoch": 0.6161333333333333, + "grad_norm": 5.431902885437012, + "kl": 0.20068359375, + "learning_rate": 6.919333333333333e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4621 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 0.6162666666666666, + "grad_norm": 3.876974105834961, + "kl": 0.23974609375, + "learning_rate": 6.918666666666666e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4622 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5, + "epoch": 0.6164, + "grad_norm": 4.019659996032715, + "kl": 0.16015625, + "learning_rate": 6.918e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4623 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.6165333333333334, + "grad_norm": 5.530038356781006, + "kl": 0.28466796875, + "learning_rate": 6.917333333333333e-07, + "loss": 0.0114, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4624 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.8125, + "epoch": 0.6166666666666667, + "grad_norm": 8.272143363952637, + "kl": 0.1806640625, + "learning_rate": 6.916666666666666e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9375, + "epoch": 0.6168, + "grad_norm": 7.151002407073975, + "kl": 0.20556640625, + "learning_rate": 6.916e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4626 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 0.6169333333333333, + "grad_norm": 0.5280484557151794, + "kl": 0.2236328125, + "learning_rate": 6.915333333333333e-07, + "loss": 0.009, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4627 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.4375, + "epoch": 0.6170666666666667, + "grad_norm": 3.4439635276794434, + "kl": 0.26806640625, + "learning_rate": 6.914666666666667e-07, + "loss": 0.0107, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4628 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.9375, + "epoch": 0.6172, + "grad_norm": 3.7112796306610107, + "kl": 0.2978515625, + "learning_rate": 6.914e-07, + "loss": 0.0119, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4629 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0625, + "epoch": 0.6173333333333333, + "grad_norm": 3.243438482284546, + "kl": 0.12841796875, + "learning_rate": 6.913333333333334e-07, + "loss": 0.0051, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4630 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 0.6174666666666667, + "grad_norm": 6.606992721557617, + "kl": 0.2880859375, + "learning_rate": 6.912666666666666e-07, + "loss": 0.0115, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4631 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.75, + "epoch": 0.6176, + "grad_norm": 13.945500373840332, + "kl": 0.22900390625, + "learning_rate": 6.912e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4632 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0625, + "epoch": 0.6177333333333334, + "grad_norm": 6.162720203399658, + "kl": 0.21337890625, + "learning_rate": 6.911333333333333e-07, + "loss": 0.0085, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4633 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0625, + "epoch": 0.6178666666666667, + "grad_norm": 4.516103267669678, + "kl": 0.23828125, + "learning_rate": 6.910666666666666e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4634 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.618, + "grad_norm": 7.3195905685424805, + "kl": 0.118896484375, + "learning_rate": 6.909999999999999e-07, + "loss": 0.0047, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4635 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.625, + "epoch": 0.6181333333333333, + "grad_norm": 8.339380264282227, + "kl": 0.2470703125, + "learning_rate": 6.909333333333332e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4636 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.875, + "epoch": 0.6182666666666666, + "grad_norm": 6.27717924118042, + "kl": 0.24169921875, + "learning_rate": 6.908666666666666e-07, + "loss": 0.0097, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4637 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 0.6184, + "grad_norm": 5.083431243896484, + "kl": 0.14404296875, + "learning_rate": 6.907999999999999e-07, + "loss": 0.0058, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4638 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 0.6185333333333334, + "grad_norm": 5.33137321472168, + "kl": 0.2744140625, + "learning_rate": 6.907333333333333e-07, + "loss": 0.011, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4639 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5, + "epoch": 0.6186666666666667, + "grad_norm": 6.0630998611450195, + "kl": 0.19482421875, + "learning_rate": 6.906666666666666e-07, + "loss": 0.0078, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4640 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 0.6188, + "grad_norm": 7.391509532928467, + "kl": 0.1962890625, + "learning_rate": 6.906e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4641 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0625, + "epoch": 0.6189333333333333, + "grad_norm": 6.059316635131836, + "kl": 0.3212890625, + "learning_rate": 6.905333333333333e-07, + "loss": 0.0128, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4642 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.6190666666666667, + "grad_norm": 8.788153648376465, + "kl": 0.21923828125, + "learning_rate": 6.904666666666666e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4643 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.25, + "epoch": 0.6192, + "grad_norm": 9.537628173828125, + "kl": 0.20263671875, + "learning_rate": 6.904e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4644 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.6193333333333333, + "grad_norm": 0.40971559286117554, + "kl": 0.18359375, + "learning_rate": 6.903333333333333e-07, + "loss": 0.0074, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4645 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0625, + "epoch": 0.6194666666666667, + "grad_norm": 5.572992324829102, + "kl": 0.22314453125, + "learning_rate": 6.902666666666667e-07, + "loss": 0.0089, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4646 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 0.6196, + "grad_norm": 6.26569128036499, + "kl": 0.20458984375, + "learning_rate": 6.902e-07, + "loss": 0.0082, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4647 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.3125, + "epoch": 0.6197333333333334, + "grad_norm": 0.5060293674468994, + "kl": 0.294921875, + "learning_rate": 6.901333333333334e-07, + "loss": 0.0118, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4648 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 0.6198666666666667, + "grad_norm": 6.9178667068481445, + "kl": 0.1640625, + "learning_rate": 6.900666666666667e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4649 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.62, + "grad_norm": 15.853049278259277, + "kl": 0.19921875, + "learning_rate": 6.9e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4650 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.25, + "epoch": 0.6201333333333333, + "grad_norm": 5.2798895835876465, + "kl": 0.1591796875, + "learning_rate": 6.899333333333333e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4651 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8125, + "epoch": 0.6202666666666666, + "grad_norm": 6.367636680603027, + "kl": 0.17041015625, + "learning_rate": 6.898666666666665e-07, + "loss": 0.0068, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4652 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0, + "epoch": 0.6204, + "grad_norm": 9.409278869628906, + "kl": 0.22900390625, + "learning_rate": 6.897999999999999e-07, + "loss": 0.0092, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4653 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.375, + "epoch": 0.6205333333333334, + "grad_norm": 8.0834321975708, + "kl": 0.185546875, + "learning_rate": 6.897333333333332e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4654 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.625, + "epoch": 0.6206666666666667, + "grad_norm": 7.620742321014404, + "kl": 0.18505859375, + "learning_rate": 6.896666666666666e-07, + "loss": 0.0074, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4655 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0625, + "epoch": 0.6208, + "grad_norm": 6.421372413635254, + "kl": 0.1767578125, + "learning_rate": 6.895999999999999e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4656 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5625, + "epoch": 0.6209333333333333, + "grad_norm": 22.347719192504883, + "kl": 0.191162109375, + "learning_rate": 6.895333333333333e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4657 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.625, + "epoch": 0.6210666666666667, + "grad_norm": 21.550596237182617, + "kl": 0.1533203125, + "learning_rate": 6.894666666666666e-07, + "loss": 0.0061, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4658 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3125, + "epoch": 0.6212, + "grad_norm": 7.321437358856201, + "kl": 0.1494140625, + "learning_rate": 6.894e-07, + "loss": 0.006, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4659 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.25, + "epoch": 0.6213333333333333, + "grad_norm": 2.969221353530884, + "kl": 0.1357421875, + "learning_rate": 6.893333333333333e-07, + "loss": 0.0054, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 4660 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0625, + "epoch": 0.6214666666666666, + "grad_norm": 5.202236175537109, + "kl": 0.177734375, + "learning_rate": 6.892666666666667e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4661 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.375, + "epoch": 0.6216, + "grad_norm": 5.956733226776123, + "kl": 0.1748046875, + "learning_rate": 6.892e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4662 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.0, + "epoch": 0.6217333333333334, + "grad_norm": 0.5072014927864075, + "kl": 0.173828125, + "learning_rate": 6.891333333333333e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4663 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5625, + "epoch": 0.6218666666666667, + "grad_norm": 3.692514657974243, + "kl": 0.25927734375, + "learning_rate": 6.890666666666667e-07, + "loss": 0.0103, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4664 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.8125, + "epoch": 0.622, + "grad_norm": 7.530259132385254, + "kl": 0.35546875, + "learning_rate": 6.889999999999999e-07, + "loss": 0.0142, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4665 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0, + "epoch": 0.6221333333333333, + "grad_norm": 6.318759441375732, + "kl": 0.20947265625, + "learning_rate": 6.889333333333333e-07, + "loss": 0.0084, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4375, + "epoch": 0.6222666666666666, + "grad_norm": 8.969366073608398, + "kl": 0.33935546875, + "learning_rate": 6.888666666666666e-07, + "loss": 0.0136, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4667 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 0.6224, + "grad_norm": 6.255199432373047, + "kl": 0.1943359375, + "learning_rate": 6.888e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.8349219560623169, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8125, + "step": 4668 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75, + "epoch": 0.6225333333333334, + "grad_norm": 7.58127498626709, + "kl": 0.2890625, + "learning_rate": 6.887333333333333e-07, + "loss": 0.0116, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4669 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.375, + "epoch": 0.6226666666666667, + "grad_norm": 7.133263111114502, + "kl": 0.20166015625, + "learning_rate": 6.886666666666667e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4670 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.6228, + "grad_norm": 4.502405166625977, + "kl": 0.21826171875, + "learning_rate": 6.886e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4671 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.6229333333333333, + "grad_norm": 5.4837646484375, + "kl": 0.19970703125, + "learning_rate": 6.885333333333333e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4672 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0, + "epoch": 0.6230666666666667, + "grad_norm": 11.700883865356445, + "kl": 0.24609375, + "learning_rate": 6.884666666666667e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4673 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.625, + "epoch": 0.6232, + "grad_norm": 6.869685173034668, + "kl": 0.21826171875, + "learning_rate": 6.883999999999999e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4674 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4375, + "epoch": 0.6233333333333333, + "grad_norm": 6.992360591888428, + "kl": 0.177490234375, + "learning_rate": 6.883333333333333e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4675 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.6875, + "epoch": 0.6234666666666666, + "grad_norm": 5.689438819885254, + "kl": 0.251953125, + "learning_rate": 6.882666666666666e-07, + "loss": 0.0101, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4676 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.4375, + "epoch": 0.6236, + "grad_norm": 13.660913467407227, + "kl": 0.15771484375, + "learning_rate": 6.882e-07, + "loss": 0.0063, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4677 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5, + "epoch": 0.6237333333333334, + "grad_norm": 7.49103307723999, + "kl": 0.2763671875, + "learning_rate": 6.881333333333333e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4678 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5625, + "epoch": 0.6238666666666667, + "grad_norm": 6.826605796813965, + "kl": 0.20703125, + "learning_rate": 6.880666666666667e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4679 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 0.624, + "grad_norm": 51.03379821777344, + "kl": 0.21923828125, + "learning_rate": 6.879999999999999e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 4680 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.6241333333333333, + "grad_norm": 9.842926025390625, + "kl": 0.2578125, + "learning_rate": 6.879333333333332e-07, + "loss": 0.0103, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4681 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.125, + "epoch": 0.6242666666666666, + "grad_norm": 5.973811626434326, + "kl": 0.20361328125, + "learning_rate": 6.878666666666666e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4682 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 0.6244, + "grad_norm": 6.3260064125061035, + "kl": 0.17919921875, + "learning_rate": 6.877999999999999e-07, + "loss": 0.0072, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4683 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.3125, + "epoch": 0.6245333333333334, + "grad_norm": 6.4416728019714355, + "kl": 0.15087890625, + "learning_rate": 6.877333333333333e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4684 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.125, + "epoch": 0.6246666666666667, + "grad_norm": 20.48126792907715, + "kl": 0.2763671875, + "learning_rate": 6.876666666666666e-07, + "loss": 0.011, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4685 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0625, + "epoch": 0.6248, + "grad_norm": 8.52875804901123, + "kl": 0.19921875, + "learning_rate": 6.876e-07, + "loss": 0.008, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4686 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.3125, + "epoch": 0.6249333333333333, + "grad_norm": 5.655427932739258, + "kl": 0.16552734375, + "learning_rate": 6.875333333333333e-07, + "loss": 0.0066, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4687 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5625, + "epoch": 0.6250666666666667, + "grad_norm": 5.078852653503418, + "kl": 0.23876953125, + "learning_rate": 6.874666666666667e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4688 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 0.6252, + "grad_norm": 7.2970685958862305, + "kl": 0.2109375, + "learning_rate": 6.874e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4689 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.6253333333333333, + "grad_norm": 5.5390753746032715, + "kl": 0.19677734375, + "learning_rate": 6.873333333333334e-07, + "loss": 0.0079, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4690 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 0.6254666666666666, + "grad_norm": 4.963134765625, + "kl": 0.17333984375, + "learning_rate": 6.872666666666667e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4691 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.375, + "epoch": 0.6256, + "grad_norm": 9.5831298828125, + "kl": 0.2255859375, + "learning_rate": 6.872e-07, + "loss": 0.009, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4692 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.6257333333333334, + "grad_norm": 4.512818813323975, + "kl": 0.275390625, + "learning_rate": 6.871333333333334e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4693 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 0.6258666666666667, + "grad_norm": 0.2710152566432953, + "kl": 0.18798828125, + "learning_rate": 6.870666666666667e-07, + "loss": 0.0075, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4694 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 0.626, + "grad_norm": 9.20253849029541, + "kl": 0.2958984375, + "learning_rate": 6.87e-07, + "loss": 0.0118, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4695 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.6261333333333333, + "grad_norm": 6.735353946685791, + "kl": 0.15234375, + "learning_rate": 6.869333333333332e-07, + "loss": 0.0061, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4696 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.4375, + "epoch": 0.6262666666666666, + "grad_norm": 9.040797233581543, + "kl": 0.2216796875, + "learning_rate": 6.868666666666666e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4697 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.6875, + "epoch": 0.6264, + "grad_norm": 10.09296703338623, + "kl": 0.3955078125, + "learning_rate": 6.867999999999999e-07, + "loss": 0.0158, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4698 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.0625, + "epoch": 0.6265333333333334, + "grad_norm": 5.738481521606445, + "kl": 0.2900390625, + "learning_rate": 6.867333333333333e-07, + "loss": 0.0116, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4699 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0625, + "epoch": 0.6266666666666667, + "grad_norm": 9.951530456542969, + "kl": 0.2783203125, + "learning_rate": 6.866666666666666e-07, + "loss": 0.0111, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4700 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.6268, + "grad_norm": 5.639536380767822, + "kl": 0.1650390625, + "learning_rate": 6.865999999999999e-07, + "loss": 0.0066, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4701 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5, + "epoch": 0.6269333333333333, + "grad_norm": 7.904031753540039, + "kl": 0.17431640625, + "learning_rate": 6.865333333333333e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4702 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.6875, + "epoch": 0.6270666666666667, + "grad_norm": 6.317661762237549, + "kl": 0.37109375, + "learning_rate": 6.864666666666666e-07, + "loss": 0.0148, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4703 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 0.6272, + "grad_norm": 3.834533929824829, + "kl": 0.18017578125, + "learning_rate": 6.864e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4704 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.6875, + "epoch": 0.6273333333333333, + "grad_norm": 7.9886651039123535, + "kl": 0.2666015625, + "learning_rate": 6.863333333333333e-07, + "loss": 0.0107, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4705 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.25, + "epoch": 0.6274666666666666, + "grad_norm": 0.35540738701820374, + "kl": 0.17724609375, + "learning_rate": 6.862666666666667e-07, + "loss": 0.0071, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 4706 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5625, + "epoch": 0.6276, + "grad_norm": 5.598479747772217, + "kl": 0.2197265625, + "learning_rate": 6.862e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4707 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 0.6277333333333334, + "grad_norm": 7.186022758483887, + "kl": 0.2236328125, + "learning_rate": 6.861333333333334e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4708 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.25, + "epoch": 0.6278666666666667, + "grad_norm": 43.16375732421875, + "kl": 0.419921875, + "learning_rate": 6.860666666666667e-07, + "loss": 0.0168, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4709 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.628, + "grad_norm": 9.847208023071289, + "kl": 0.189453125, + "learning_rate": 6.86e-07, + "loss": 0.0076, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4710 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.6875, + "epoch": 0.6281333333333333, + "grad_norm": 27.386268615722656, + "kl": 0.19384765625, + "learning_rate": 6.859333333333333e-07, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4711 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.6282666666666666, + "grad_norm": 37.42302703857422, + "kl": 0.2021484375, + "learning_rate": 6.858666666666666e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4712 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.25, + "epoch": 0.6284, + "grad_norm": 7.3795623779296875, + "kl": 0.15283203125, + "learning_rate": 6.858e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4713 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4375, + "epoch": 0.6285333333333334, + "grad_norm": 7.491910934448242, + "kl": 0.1865234375, + "learning_rate": 6.857333333333333e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4714 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1875, + "epoch": 0.6286666666666667, + "grad_norm": 0.33580946922302246, + "kl": 0.17919921875, + "learning_rate": 6.856666666666667e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4715 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0625, + "epoch": 0.6288, + "grad_norm": 4.390381813049316, + "kl": 0.23828125, + "learning_rate": 6.855999999999999e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4716 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.9375, + "epoch": 0.6289333333333333, + "grad_norm": 14.894247055053711, + "kl": 0.21484375, + "learning_rate": 6.855333333333333e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4717 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.625, + "epoch": 0.6290666666666667, + "grad_norm": 6.75700044631958, + "kl": 0.23486328125, + "learning_rate": 6.854666666666666e-07, + "loss": 0.0094, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4718 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4375, + "epoch": 0.6292, + "grad_norm": 7.482954502105713, + "kl": 0.169921875, + "learning_rate": 6.853999999999999e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4719 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 0.6293333333333333, + "grad_norm": 4.7000732421875, + "kl": 0.17626953125, + "learning_rate": 6.853333333333333e-07, + "loss": 0.0071, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4720 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.6294666666666666, + "grad_norm": 7.375242710113525, + "kl": 0.20361328125, + "learning_rate": 6.852666666666666e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4721 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.25, + "epoch": 0.6296, + "grad_norm": 55.252193450927734, + "kl": 3.3056640625, + "learning_rate": 6.852e-07, + "loss": 0.1317, + "reward": 1.5625, + "reward_std": 0.7216846346855164, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4722 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.75, + "epoch": 0.6297333333333334, + "grad_norm": 5.8302903175354, + "kl": 0.1767578125, + "learning_rate": 6.851333333333333e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4723 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.6298666666666667, + "grad_norm": 20.897205352783203, + "kl": 1.03564453125, + "learning_rate": 6.850666666666667e-07, + "loss": 0.0415, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4724 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.63, + "grad_norm": 4.103687763214111, + "kl": 0.20458984375, + "learning_rate": 6.85e-07, + "loss": 0.0082, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4725 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1875, + "epoch": 0.6301333333333333, + "grad_norm": 8.738473892211914, + "kl": 0.17333984375, + "learning_rate": 6.849333333333333e-07, + "loss": 0.0069, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 4726 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.6302666666666666, + "grad_norm": 5.682011127471924, + "kl": 0.22705078125, + "learning_rate": 6.848666666666666e-07, + "loss": 0.0091, + "reward": 0.9375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 4727 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 0.6304, + "grad_norm": 1.7136255502700806, + "kl": 0.26220703125, + "learning_rate": 6.847999999999999e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4728 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.6305333333333333, + "grad_norm": 4.513340950012207, + "kl": 0.208984375, + "learning_rate": 6.847333333333333e-07, + "loss": 0.0084, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4729 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.3125, + "epoch": 0.6306666666666667, + "grad_norm": 6.5583176612854, + "kl": 0.2509765625, + "learning_rate": 6.846666666666666e-07, + "loss": 0.01, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4730 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 0.6308, + "grad_norm": 7.005410671234131, + "kl": 0.17626953125, + "learning_rate": 6.846e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4731 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 0.6309333333333333, + "grad_norm": 9.349011421203613, + "kl": 0.14306640625, + "learning_rate": 6.845333333333333e-07, + "loss": 0.0057, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4732 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.375, + "epoch": 0.6310666666666667, + "grad_norm": 0.3116394281387329, + "kl": 0.2158203125, + "learning_rate": 6.844666666666667e-07, + "loss": 0.0086, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4733 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.625, + "epoch": 0.6312, + "grad_norm": 3.2982163429260254, + "kl": 0.141357421875, + "learning_rate": 6.844e-07, + "loss": 0.0057, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4734 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 0.6313333333333333, + "grad_norm": 6.0662007331848145, + "kl": 0.154296875, + "learning_rate": 6.843333333333334e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4735 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.25, + "epoch": 0.6314666666666666, + "grad_norm": 8.48488998413086, + "kl": 0.228515625, + "learning_rate": 6.842666666666667e-07, + "loss": 0.0091, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.75, + "step": 4736 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.8125, + "epoch": 0.6316, + "grad_norm": 7.0102081298828125, + "kl": 0.2900390625, + "learning_rate": 6.842e-07, + "loss": 0.0116, + "reward": 1.6875, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 4737 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1875, + "epoch": 0.6317333333333334, + "grad_norm": 3.9453840255737305, + "kl": 0.14990234375, + "learning_rate": 6.841333333333333e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4738 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5625, + "epoch": 0.6318666666666667, + "grad_norm": 8.41317081451416, + "kl": 0.23974609375, + "learning_rate": 6.840666666666666e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4739 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.632, + "grad_norm": 4.262097358703613, + "kl": 0.17724609375, + "learning_rate": 6.84e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4740 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.875, + "epoch": 0.6321333333333333, + "grad_norm": 6.738330364227295, + "kl": 0.1787109375, + "learning_rate": 6.839333333333332e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4741 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.6875, + "epoch": 0.6322666666666666, + "grad_norm": 8.203278541564941, + "kl": 0.24853515625, + "learning_rate": 6.838666666666666e-07, + "loss": 0.01, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4742 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8125, + "epoch": 0.6324, + "grad_norm": 8.355859756469727, + "kl": 0.2265625, + "learning_rate": 6.837999999999999e-07, + "loss": 0.0091, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4743 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.6325333333333333, + "grad_norm": 39.73149490356445, + "kl": 0.26904296875, + "learning_rate": 6.837333333333333e-07, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 4744 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.6326666666666667, + "grad_norm": 6.860358238220215, + "kl": 0.14892578125, + "learning_rate": 6.836666666666666e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4745 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.75, + "epoch": 0.6328, + "grad_norm": 13.034178733825684, + "kl": 0.24169921875, + "learning_rate": 6.836e-07, + "loss": 0.0097, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.8125, + "step": 4746 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.3125, + "epoch": 0.6329333333333333, + "grad_norm": 4.247931957244873, + "kl": 0.197265625, + "learning_rate": 6.835333333333333e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4747 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.6875, + "epoch": 0.6330666666666667, + "grad_norm": 4.037827491760254, + "kl": 0.27734375, + "learning_rate": 6.834666666666666e-07, + "loss": 0.0111, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4748 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.8125, + "epoch": 0.6332, + "grad_norm": 6.306044101715088, + "kl": 0.248046875, + "learning_rate": 6.834e-07, + "loss": 0.0099, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4749 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.5, + "epoch": 0.6333333333333333, + "grad_norm": 6.426931858062744, + "kl": 0.1748046875, + "learning_rate": 6.833333333333333e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4750 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.875, + "epoch": 0.6334666666666666, + "grad_norm": 7.350614547729492, + "kl": 0.1982421875, + "learning_rate": 6.832666666666667e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4751 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.125, + "epoch": 0.6336, + "grad_norm": 7.449317455291748, + "kl": 0.3310546875, + "learning_rate": 6.832e-07, + "loss": 0.0132, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 4752 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5, + "epoch": 0.6337333333333334, + "grad_norm": 10.070070266723633, + "kl": 0.2060546875, + "learning_rate": 6.831333333333334e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4753 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.125, + "epoch": 0.6338666666666667, + "grad_norm": 8.300990104675293, + "kl": 0.1943359375, + "learning_rate": 6.830666666666667e-07, + "loss": 0.0078, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4754 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5, + "epoch": 0.634, + "grad_norm": 6.7703070640563965, + "kl": 0.25537109375, + "learning_rate": 6.830000000000001e-07, + "loss": 0.0102, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4755 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.6341333333333333, + "grad_norm": 7.6181230545043945, + "kl": 0.1865234375, + "learning_rate": 6.829333333333333e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4756 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.6342666666666666, + "grad_norm": 8.994882583618164, + "kl": 0.267578125, + "learning_rate": 6.828666666666665e-07, + "loss": 0.0107, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4757 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.625, + "epoch": 0.6344, + "grad_norm": 7.5374860763549805, + "kl": 0.232421875, + "learning_rate": 6.827999999999999e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.8125, + "step": 4758 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 0.6345333333333333, + "grad_norm": 4.886191368103027, + "kl": 0.1669921875, + "learning_rate": 6.827333333333332e-07, + "loss": 0.0067, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4759 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5, + "epoch": 0.6346666666666667, + "grad_norm": 0.5575668215751648, + "kl": 0.31689453125, + "learning_rate": 6.826666666666666e-07, + "loss": 0.0127, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4760 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.6875, + "epoch": 0.6348, + "grad_norm": 6.726316928863525, + "kl": 0.20263671875, + "learning_rate": 6.825999999999999e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4761 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3125, + "epoch": 0.6349333333333333, + "grad_norm": 6.466181755065918, + "kl": 0.16455078125, + "learning_rate": 6.825333333333333e-07, + "loss": 0.0066, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4762 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.6350666666666667, + "grad_norm": 0.576538622379303, + "kl": 0.24560546875, + "learning_rate": 6.824666666666666e-07, + "loss": 0.0098, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4763 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5, + "epoch": 0.6352, + "grad_norm": 8.930429458618164, + "kl": 0.22998046875, + "learning_rate": 6.824e-07, + "loss": 0.0092, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4764 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.25, + "epoch": 0.6353333333333333, + "grad_norm": 11.5562162399292, + "kl": 0.3603515625, + "learning_rate": 6.823333333333333e-07, + "loss": 0.0144, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4765 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0625, + "epoch": 0.6354666666666666, + "grad_norm": 6.229418754577637, + "kl": 0.20458984375, + "learning_rate": 6.822666666666666e-07, + "loss": 0.0082, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4766 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.75, + "epoch": 0.6356, + "grad_norm": 6.840510845184326, + "kl": 0.16162109375, + "learning_rate": 6.822e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4767 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.5, + "epoch": 0.6357333333333334, + "grad_norm": 11.704100608825684, + "kl": 0.2392578125, + "learning_rate": 6.821333333333333e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4768 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5, + "epoch": 0.6358666666666667, + "grad_norm": 8.286408424377441, + "kl": 0.23291015625, + "learning_rate": 6.820666666666667e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4769 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.875, + "epoch": 0.636, + "grad_norm": 0.743850827217102, + "kl": 0.20361328125, + "learning_rate": 6.82e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4770 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.0625, + "epoch": 0.6361333333333333, + "grad_norm": 8.835793495178223, + "kl": 0.48876953125, + "learning_rate": 6.819333333333333e-07, + "loss": 0.0195, + "reward": 1.0, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.8125, + "step": 4771 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.6362666666666666, + "grad_norm": 9.387849807739258, + "kl": 0.31982421875, + "learning_rate": 6.818666666666666e-07, + "loss": 0.0128, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4772 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.1875, + "epoch": 0.6364, + "grad_norm": 6.628021717071533, + "kl": 0.2333984375, + "learning_rate": 6.818e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4773 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.6365333333333333, + "grad_norm": 9.062941551208496, + "kl": 0.24658203125, + "learning_rate": 6.817333333333333e-07, + "loss": 0.0099, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4774 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 0.6366666666666667, + "grad_norm": 8.357036590576172, + "kl": 0.26611328125, + "learning_rate": 6.816666666666666e-07, + "loss": 0.0107, + "reward": 1.25, + "reward_std": 0.6924468874931335, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4775 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.1875, + "epoch": 0.6368, + "grad_norm": 10.26742172241211, + "kl": 0.27734375, + "learning_rate": 6.816e-07, + "loss": 0.0111, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4776 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.6369333333333334, + "grad_norm": 5.891622066497803, + "kl": 0.26513671875, + "learning_rate": 6.815333333333333e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4777 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.6370666666666667, + "grad_norm": 19.52133560180664, + "kl": 0.21923828125, + "learning_rate": 6.814666666666667e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4778 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.3125, + "epoch": 0.6372, + "grad_norm": 5.901082515716553, + "kl": 0.208984375, + "learning_rate": 6.814e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4779 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.4375, + "epoch": 0.6373333333333333, + "grad_norm": 7.5164899826049805, + "kl": 0.2373046875, + "learning_rate": 6.813333333333333e-07, + "loss": 0.0095, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4780 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.125, + "epoch": 0.6374666666666666, + "grad_norm": 7.1107177734375, + "kl": 0.20947265625, + "learning_rate": 6.812666666666666e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4781 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.6875, + "epoch": 0.6376, + "grad_norm": 6.50661563873291, + "kl": 0.25830078125, + "learning_rate": 6.812e-07, + "loss": 0.0103, + "reward": 0.875, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8125, + "step": 4782 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.4375, + "epoch": 0.6377333333333334, + "grad_norm": 7.745901107788086, + "kl": 0.1845703125, + "learning_rate": 6.811333333333333e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4783 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 0.6378666666666667, + "grad_norm": 4.830611228942871, + "kl": 0.25537109375, + "learning_rate": 6.810666666666667e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4784 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.4375, + "epoch": 0.638, + "grad_norm": 6.8594207763671875, + "kl": 0.2705078125, + "learning_rate": 6.81e-07, + "loss": 0.0108, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4785 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 0.6381333333333333, + "grad_norm": 10.195659637451172, + "kl": 0.19873046875, + "learning_rate": 6.809333333333332e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4786 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.75, + "epoch": 0.6382666666666666, + "grad_norm": 6.098860263824463, + "kl": 0.1865234375, + "learning_rate": 6.808666666666666e-07, + "loss": 0.0075, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4787 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25, + "epoch": 0.6384, + "grad_norm": 5.130894660949707, + "kl": 0.13232421875, + "learning_rate": 6.807999999999999e-07, + "loss": 0.0053, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4788 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.1875, + "epoch": 0.6385333333333333, + "grad_norm": 9.788566589355469, + "kl": 0.26220703125, + "learning_rate": 6.807333333333333e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4789 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.375, + "epoch": 0.6386666666666667, + "grad_norm": 10.673611640930176, + "kl": 0.2060546875, + "learning_rate": 6.806666666666666e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4790 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.25, + "epoch": 0.6388, + "grad_norm": 4.950587749481201, + "kl": 0.283203125, + "learning_rate": 6.806e-07, + "loss": 0.0113, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4791 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.6389333333333334, + "grad_norm": 39.62998580932617, + "kl": 0.1884765625, + "learning_rate": 6.805333333333333e-07, + "loss": 0.0075, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4792 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.125, + "epoch": 0.6390666666666667, + "grad_norm": 8.487863540649414, + "kl": 0.23486328125, + "learning_rate": 6.804666666666667e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4793 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3125, + "epoch": 0.6392, + "grad_norm": 0.8941927552223206, + "kl": 0.173828125, + "learning_rate": 6.804e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4794 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5625, + "epoch": 0.6393333333333333, + "grad_norm": 5.540919303894043, + "kl": 0.24951171875, + "learning_rate": 6.803333333333333e-07, + "loss": 0.01, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4795 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5, + "epoch": 0.6394666666666666, + "grad_norm": 7.85786247253418, + "kl": 0.283203125, + "learning_rate": 6.802666666666667e-07, + "loss": 0.0113, + "reward": 1.625, + "reward_std": 0.6348394006490707, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 4796 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.375, + "epoch": 0.6396, + "grad_norm": 5.28358793258667, + "kl": 0.162109375, + "learning_rate": 6.802e-07, + "loss": 0.0065, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 4797 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5625, + "epoch": 0.6397333333333334, + "grad_norm": 6.2302327156066895, + "kl": 0.1591796875, + "learning_rate": 6.801333333333334e-07, + "loss": 0.0064, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4798 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8125, + "epoch": 0.6398666666666667, + "grad_norm": 7.949600696563721, + "kl": 0.2705078125, + "learning_rate": 6.800666666666667e-07, + "loss": 0.0108, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4799 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.25, + "epoch": 0.64, + "grad_norm": 4.700380802154541, + "kl": 0.216796875, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0087, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4800 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.25, + "epoch": 0.6401333333333333, + "grad_norm": 9.132411003112793, + "kl": 0.31201171875, + "learning_rate": 6.799333333333332e-07, + "loss": 0.0125, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4801 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 0.6402666666666667, + "grad_norm": 5.980921745300293, + "kl": 0.265625, + "learning_rate": 6.798666666666666e-07, + "loss": 0.0106, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4802 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.6404, + "grad_norm": 4.6340742111206055, + "kl": 0.2705078125, + "learning_rate": 6.797999999999999e-07, + "loss": 0.0108, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4803 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.9375, + "epoch": 0.6405333333333333, + "grad_norm": 3.131864309310913, + "kl": 0.182861328125, + "learning_rate": 6.797333333333332e-07, + "loss": 0.0073, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4804 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 0.6406666666666667, + "grad_norm": 8.826027870178223, + "kl": 0.1708984375, + "learning_rate": 6.796666666666666e-07, + "loss": 0.0068, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4805 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.125, + "epoch": 0.6408, + "grad_norm": 10.606400489807129, + "kl": 0.1962890625, + "learning_rate": 6.795999999999999e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4806 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.9375, + "epoch": 0.6409333333333334, + "grad_norm": 4.483954906463623, + "kl": 0.287109375, + "learning_rate": 6.795333333333333e-07, + "loss": 0.0115, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4807 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.875, + "epoch": 0.6410666666666667, + "grad_norm": 5.077282428741455, + "kl": 0.1806640625, + "learning_rate": 6.794666666666666e-07, + "loss": 0.0072, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4808 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3125, + "epoch": 0.6412, + "grad_norm": 4.9493584632873535, + "kl": 0.19189453125, + "learning_rate": 6.794e-07, + "loss": 0.0077, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 4809 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.125, + "epoch": 0.6413333333333333, + "grad_norm": 8.30848217010498, + "kl": 0.4423828125, + "learning_rate": 6.793333333333333e-07, + "loss": 0.0177, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 4810 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 0.6414666666666666, + "grad_norm": 8.395946502685547, + "kl": 0.23876953125, + "learning_rate": 6.792666666666667e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4811 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.625, + "epoch": 0.6416, + "grad_norm": 6.863739013671875, + "kl": 0.21435546875, + "learning_rate": 6.792e-07, + "loss": 0.0086, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4812 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.3125, + "epoch": 0.6417333333333334, + "grad_norm": 5.6705427169799805, + "kl": 0.220703125, + "learning_rate": 6.791333333333333e-07, + "loss": 0.0088, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4813 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.625, + "epoch": 0.6418666666666667, + "grad_norm": 9.96398639678955, + "kl": 0.2783203125, + "learning_rate": 6.790666666666667e-07, + "loss": 0.0111, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4814 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.642, + "grad_norm": 6.293842315673828, + "kl": 0.21923828125, + "learning_rate": 6.79e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4815 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 0.6421333333333333, + "grad_norm": 8.950760841369629, + "kl": 0.318359375, + "learning_rate": 6.789333333333334e-07, + "loss": 0.0127, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4816 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.9375, + "epoch": 0.6422666666666667, + "grad_norm": 5.117428779602051, + "kl": 0.3017578125, + "learning_rate": 6.788666666666666e-07, + "loss": 0.012, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4817 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.8125, + "epoch": 0.6424, + "grad_norm": 4.641177654266357, + "kl": 0.19287109375, + "learning_rate": 6.788e-07, + "loss": 0.0077, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4818 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.4375, + "epoch": 0.6425333333333333, + "grad_norm": 7.656569480895996, + "kl": 0.279296875, + "learning_rate": 6.787333333333333e-07, + "loss": 0.0111, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4819 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.1875, + "epoch": 0.6426666666666667, + "grad_norm": 7.467403411865234, + "kl": 0.20703125, + "learning_rate": 6.786666666666667e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4820 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 0.6428, + "grad_norm": 8.225841522216797, + "kl": 0.189453125, + "learning_rate": 6.786e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4821 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.875, + "epoch": 0.6429333333333334, + "grad_norm": 6.127058029174805, + "kl": 0.2001953125, + "learning_rate": 6.785333333333332e-07, + "loss": 0.008, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4822 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.6875, + "epoch": 0.6430666666666667, + "grad_norm": 6.220033645629883, + "kl": 0.25146484375, + "learning_rate": 6.784666666666666e-07, + "loss": 0.0101, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4823 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0625, + "epoch": 0.6432, + "grad_norm": 10.386346817016602, + "kl": 0.255859375, + "learning_rate": 6.783999999999999e-07, + "loss": 0.0103, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4824 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0, + "epoch": 0.6433333333333333, + "grad_norm": 8.22334098815918, + "kl": 0.2568359375, + "learning_rate": 6.783333333333333e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4825 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5625, + "epoch": 0.6434666666666666, + "grad_norm": 4.842483043670654, + "kl": 0.2314453125, + "learning_rate": 6.782666666666666e-07, + "loss": 0.0093, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4826 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.375, + "epoch": 0.6436, + "grad_norm": 6.591144561767578, + "kl": 0.2080078125, + "learning_rate": 6.782e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4827 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.875, + "epoch": 0.6437333333333334, + "grad_norm": 6.929938793182373, + "kl": 0.1484375, + "learning_rate": 6.781333333333333e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4828 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 0.6438666666666667, + "grad_norm": 0.5331937670707703, + "kl": 0.19384765625, + "learning_rate": 6.780666666666667e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4829 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 0.644, + "grad_norm": 4.65748405456543, + "kl": 0.18310546875, + "learning_rate": 6.78e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4830 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.6441333333333333, + "grad_norm": 5.7468061447143555, + "kl": 0.22412109375, + "learning_rate": 6.779333333333334e-07, + "loss": 0.009, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4831 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1875, + "epoch": 0.6442666666666667, + "grad_norm": 5.781530857086182, + "kl": 0.16796875, + "learning_rate": 6.778666666666666e-07, + "loss": 0.0067, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4832 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.6444, + "grad_norm": 3.7298691272735596, + "kl": 0.2119140625, + "learning_rate": 6.777999999999999e-07, + "loss": 0.0085, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4833 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0625, + "epoch": 0.6445333333333333, + "grad_norm": 7.712064743041992, + "kl": 0.16455078125, + "learning_rate": 6.777333333333333e-07, + "loss": 0.0066, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4834 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.6446666666666667, + "grad_norm": 0.3997657597064972, + "kl": 0.23046875, + "learning_rate": 6.776666666666666e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4835 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8125, + "epoch": 0.6448, + "grad_norm": 6.24120569229126, + "kl": 0.19384765625, + "learning_rate": 6.776e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4836 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 0.6449333333333334, + "grad_norm": 78.1246337890625, + "kl": 0.19091796875, + "learning_rate": 6.775333333333333e-07, + "loss": 0.0077, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8125, + "step": 4837 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 0.6450666666666667, + "grad_norm": 8.917814254760742, + "kl": 0.17041015625, + "learning_rate": 6.774666666666667e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4838 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.25, + "epoch": 0.6452, + "grad_norm": 6.005443096160889, + "kl": 0.19677734375, + "learning_rate": 6.774e-07, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4839 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.25, + "epoch": 0.6453333333333333, + "grad_norm": 6.1245436668396, + "kl": 0.18115234375, + "learning_rate": 6.773333333333334e-07, + "loss": 0.0072, + "reward": 1.1875, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 4840 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6875, + "epoch": 0.6454666666666666, + "grad_norm": 7.003738880157471, + "kl": 0.24365234375, + "learning_rate": 6.772666666666667e-07, + "loss": 0.0097, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4841 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 0.6456, + "grad_norm": 4.606637954711914, + "kl": 0.23388671875, + "learning_rate": 6.772e-07, + "loss": 0.0094, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4842 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4375, + "epoch": 0.6457333333333334, + "grad_norm": 5.733161926269531, + "kl": 0.2626953125, + "learning_rate": 6.771333333333333e-07, + "loss": 0.0105, + "reward": 1.0625, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 4843 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5, + "epoch": 0.6458666666666667, + "grad_norm": 12.730903625488281, + "kl": 0.26806640625, + "learning_rate": 6.770666666666666e-07, + "loss": 0.0107, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4844 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 0.646, + "grad_norm": 8.887799263000488, + "kl": 0.22216796875, + "learning_rate": 6.77e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4845 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.4375, + "epoch": 0.6461333333333333, + "grad_norm": 5.950397968292236, + "kl": 0.140625, + "learning_rate": 6.769333333333333e-07, + "loss": 0.0056, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4846 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.75, + "epoch": 0.6462666666666667, + "grad_norm": 4.6708574295043945, + "kl": 0.18017578125, + "learning_rate": 6.768666666666666e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4847 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.6464, + "grad_norm": 9.9948091506958, + "kl": 0.1806640625, + "learning_rate": 6.767999999999999e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4848 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.6465333333333333, + "grad_norm": 8.981426239013672, + "kl": 0.2548828125, + "learning_rate": 6.767333333333333e-07, + "loss": 0.0102, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4849 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 0.6466666666666666, + "grad_norm": 7.566578388214111, + "kl": 0.1943359375, + "learning_rate": 6.766666666666666e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4850 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.9375, + "epoch": 0.6468, + "grad_norm": 9.622711181640625, + "kl": 0.5166015625, + "learning_rate": 6.765999999999999e-07, + "loss": 0.0206, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4851 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.3125, + "epoch": 0.6469333333333334, + "grad_norm": 8.205312728881836, + "kl": 0.16796875, + "learning_rate": 6.765333333333333e-07, + "loss": 0.0067, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4852 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.875, + "epoch": 0.6470666666666667, + "grad_norm": 6.115651607513428, + "kl": 0.185546875, + "learning_rate": 6.764666666666666e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4853 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5625, + "epoch": 0.6472, + "grad_norm": 7.389400005340576, + "kl": 0.1962890625, + "learning_rate": 6.764e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4854 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.6473333333333333, + "grad_norm": 4.320909023284912, + "kl": 0.17724609375, + "learning_rate": 6.763333333333333e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4855 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0625, + "epoch": 0.6474666666666666, + "grad_norm": 6.289999961853027, + "kl": 0.232421875, + "learning_rate": 6.762666666666667e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4856 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0, + "epoch": 0.6476, + "grad_norm": 4.993790149688721, + "kl": 0.20166015625, + "learning_rate": 6.762e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4857 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0, + "epoch": 0.6477333333333334, + "grad_norm": 6.186489105224609, + "kl": 0.1845703125, + "learning_rate": 6.761333333333334e-07, + "loss": 0.0074, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4858 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.25, + "epoch": 0.6478666666666667, + "grad_norm": 5.07059383392334, + "kl": 0.18603515625, + "learning_rate": 6.760666666666667e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4859 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5, + "epoch": 0.648, + "grad_norm": 6.397017002105713, + "kl": 0.15478515625, + "learning_rate": 6.76e-07, + "loss": 0.0062, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4860 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.5625, + "epoch": 0.6481333333333333, + "grad_norm": 8.700577735900879, + "kl": 0.1826171875, + "learning_rate": 6.759333333333334e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4861 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6875, + "epoch": 0.6482666666666667, + "grad_norm": 4.959262847900391, + "kl": 0.2724609375, + "learning_rate": 6.758666666666666e-07, + "loss": 0.0109, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4862 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.5625, + "epoch": 0.6484, + "grad_norm": 5.314138412475586, + "kl": 0.18603515625, + "learning_rate": 6.758e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4863 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.0625, + "epoch": 0.6485333333333333, + "grad_norm": 6.2254252433776855, + "kl": 0.21337890625, + "learning_rate": 6.757333333333332e-07, + "loss": 0.0085, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4864 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.4375, + "epoch": 0.6486666666666666, + "grad_norm": 6.526933193206787, + "kl": 0.18701171875, + "learning_rate": 6.756666666666666e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4865 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.625, + "epoch": 0.6488, + "grad_norm": 10.248577117919922, + "kl": 0.18798828125, + "learning_rate": 6.755999999999999e-07, + "loss": 0.0075, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4866 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.0, + "epoch": 0.6489333333333334, + "grad_norm": 5.82078218460083, + "kl": 0.30517578125, + "learning_rate": 6.755333333333333e-07, + "loss": 0.0122, + "reward": 1.375, + "reward_std": 0.7892733812332153, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8125, + "step": 4867 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6875, + "epoch": 0.6490666666666667, + "grad_norm": 3.8193206787109375, + "kl": 0.20947265625, + "learning_rate": 6.754666666666666e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4868 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.6492, + "grad_norm": 5.041746616363525, + "kl": 0.2001953125, + "learning_rate": 6.753999999999999e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4869 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.3125, + "epoch": 0.6493333333333333, + "grad_norm": 5.694819450378418, + "kl": 0.19580078125, + "learning_rate": 6.753333333333333e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4870 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0625, + "epoch": 0.6494666666666666, + "grad_norm": 7.824845790863037, + "kl": 0.1865234375, + "learning_rate": 6.752666666666666e-07, + "loss": 0.0075, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4871 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.6496, + "grad_norm": 46.96224594116211, + "kl": 0.15576171875, + "learning_rate": 6.752e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4872 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0625, + "epoch": 0.6497333333333334, + "grad_norm": 6.864575386047363, + "kl": 0.23828125, + "learning_rate": 6.751333333333333e-07, + "loss": 0.0095, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4873 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.375, + "epoch": 0.6498666666666667, + "grad_norm": 11.416213989257812, + "kl": 0.56982421875, + "learning_rate": 6.750666666666667e-07, + "loss": 0.0228, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4874 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.3125, + "epoch": 0.65, + "grad_norm": 7.65495491027832, + "kl": 0.22607421875, + "learning_rate": 6.75e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4875 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.25, + "epoch": 0.6501333333333333, + "grad_norm": 23.475830078125, + "kl": 0.8291015625, + "learning_rate": 6.749333333333334e-07, + "loss": 0.0331, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 4876 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9375, + "epoch": 0.6502666666666667, + "grad_norm": 15.600449562072754, + "kl": 0.21533203125, + "learning_rate": 6.748666666666666e-07, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4877 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0625, + "epoch": 0.6504, + "grad_norm": 8.114553451538086, + "kl": 0.390625, + "learning_rate": 6.747999999999999e-07, + "loss": 0.0156, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4878 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.5, + "epoch": 0.6505333333333333, + "grad_norm": 13.76401138305664, + "kl": 0.47265625, + "learning_rate": 6.747333333333333e-07, + "loss": 0.0189, + "reward": 1.5, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8125, + "step": 4879 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.3125, + "epoch": 0.6506666666666666, + "grad_norm": 7.098784923553467, + "kl": 0.21435546875, + "learning_rate": 6.746666666666666e-07, + "loss": 0.0086, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 4880 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.6508, + "grad_norm": 5.208499908447266, + "kl": 0.2529296875, + "learning_rate": 6.746e-07, + "loss": 0.0101, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4881 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.6509333333333334, + "grad_norm": 14.358799934387207, + "kl": 0.1865234375, + "learning_rate": 6.745333333333333e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4882 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4375, + "epoch": 0.6510666666666667, + "grad_norm": 6.054689884185791, + "kl": 0.23876953125, + "learning_rate": 6.744666666666667e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4883 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0625, + "epoch": 0.6512, + "grad_norm": 7.068310737609863, + "kl": 0.1904296875, + "learning_rate": 6.744e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4884 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.4375, + "epoch": 0.6513333333333333, + "grad_norm": 4.154273986816406, + "kl": 0.181640625, + "learning_rate": 6.743333333333333e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4885 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0, + "epoch": 0.6514666666666666, + "grad_norm": 5.696682453155518, + "kl": 0.19775390625, + "learning_rate": 6.742666666666666e-07, + "loss": 0.0079, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4886 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.625, + "epoch": 0.6516, + "grad_norm": 8.25047779083252, + "kl": 0.23779296875, + "learning_rate": 6.742e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4887 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.3125, + "epoch": 0.6517333333333334, + "grad_norm": 5.396240234375, + "kl": 0.2021484375, + "learning_rate": 6.741333333333333e-07, + "loss": 0.0081, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4888 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4375, + "epoch": 0.6518666666666667, + "grad_norm": 6.188131332397461, + "kl": 0.17724609375, + "learning_rate": 6.740666666666666e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4889 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.75, + "epoch": 0.652, + "grad_norm": 6.977083206176758, + "kl": 0.220703125, + "learning_rate": 6.74e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4890 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.6875, + "epoch": 0.6521333333333333, + "grad_norm": 6.727902889251709, + "kl": 0.3173828125, + "learning_rate": 6.739333333333333e-07, + "loss": 0.0127, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4891 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.375, + "epoch": 0.6522666666666667, + "grad_norm": 10.03739070892334, + "kl": 0.20703125, + "learning_rate": 6.738666666666666e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4892 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5625, + "epoch": 0.6524, + "grad_norm": 5.788607120513916, + "kl": 0.20703125, + "learning_rate": 6.737999999999999e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4893 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 0.6525333333333333, + "grad_norm": 3.776160955429077, + "kl": 0.2294921875, + "learning_rate": 6.737333333333333e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4894 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6875, + "epoch": 0.6526666666666666, + "grad_norm": 8.69021987915039, + "kl": 0.2333984375, + "learning_rate": 6.736666666666666e-07, + "loss": 0.0093, + "reward": 1.4375, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4895 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.6875, + "epoch": 0.6528, + "grad_norm": 6.792179584503174, + "kl": 0.3408203125, + "learning_rate": 6.736e-07, + "loss": 0.0137, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4896 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.875, + "epoch": 0.6529333333333334, + "grad_norm": 7.1063008308410645, + "kl": 0.22216796875, + "learning_rate": 6.735333333333333e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4897 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.0, + "epoch": 0.6530666666666667, + "grad_norm": 3.2546887397766113, + "kl": 0.17578125, + "learning_rate": 6.734666666666666e-07, + "loss": 0.007, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4898 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.625, + "epoch": 0.6532, + "grad_norm": 10.588884353637695, + "kl": 0.529296875, + "learning_rate": 6.734e-07, + "loss": 0.0212, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 4899 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.3125, + "epoch": 0.6533333333333333, + "grad_norm": 4.868190765380859, + "kl": 0.2138671875, + "learning_rate": 6.733333333333333e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4900 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.125, + "epoch": 0.6534666666666666, + "grad_norm": 8.148101806640625, + "kl": 0.337890625, + "learning_rate": 6.732666666666667e-07, + "loss": 0.0135, + "reward": 1.4375, + "reward_std": 0.7253239452838898, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4901 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4375, + "epoch": 0.6536, + "grad_norm": 7.006053924560547, + "kl": 0.171875, + "learning_rate": 6.732e-07, + "loss": 0.0069, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4902 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0625, + "epoch": 0.6537333333333334, + "grad_norm": 6.51782751083374, + "kl": 0.17138671875, + "learning_rate": 6.731333333333334e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4903 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.625, + "epoch": 0.6538666666666667, + "grad_norm": 3.367656707763672, + "kl": 0.17236328125, + "learning_rate": 6.730666666666667e-07, + "loss": 0.0069, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4904 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.8125, + "epoch": 0.654, + "grad_norm": 6.525534629821777, + "kl": 0.143310546875, + "learning_rate": 6.730000000000001e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4905 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.5, + "epoch": 0.6541333333333333, + "grad_norm": 3.975397825241089, + "kl": 0.1484375, + "learning_rate": 6.729333333333334e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4906 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.125, + "epoch": 0.6542666666666667, + "grad_norm": 4.773612976074219, + "kl": 0.14794921875, + "learning_rate": 6.728666666666665e-07, + "loss": 0.0059, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4907 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.875, + "epoch": 0.6544, + "grad_norm": 0.4511917531490326, + "kl": 0.283203125, + "learning_rate": 6.727999999999999e-07, + "loss": 0.0113, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4908 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4375, + "epoch": 0.6545333333333333, + "grad_norm": 8.26906681060791, + "kl": 0.29638671875, + "learning_rate": 6.727333333333332e-07, + "loss": 0.0118, + "reward": 1.5625, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4909 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.3125, + "epoch": 0.6546666666666666, + "grad_norm": 9.140854835510254, + "kl": 0.175537109375, + "learning_rate": 6.726666666666666e-07, + "loss": 0.007, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4910 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0625, + "epoch": 0.6548, + "grad_norm": 5.571792125701904, + "kl": 0.1484375, + "learning_rate": 6.725999999999999e-07, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 4911 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4375, + "epoch": 0.6549333333333334, + "grad_norm": 9.612812995910645, + "kl": 0.16650390625, + "learning_rate": 6.725333333333333e-07, + "loss": 0.0067, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4912 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0625, + "epoch": 0.6550666666666667, + "grad_norm": 7.779406547546387, + "kl": 0.2705078125, + "learning_rate": 6.724666666666666e-07, + "loss": 0.0108, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4913 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5625, + "epoch": 0.6552, + "grad_norm": 6.605146884918213, + "kl": 0.20751953125, + "learning_rate": 6.724e-07, + "loss": 0.0083, + "reward": 0.9375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 4914 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.875, + "epoch": 0.6553333333333333, + "grad_norm": 5.656046390533447, + "kl": 0.2685546875, + "learning_rate": 6.723333333333333e-07, + "loss": 0.0108, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4915 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.6554666666666666, + "grad_norm": 4.248782634735107, + "kl": 0.19189453125, + "learning_rate": 6.722666666666666e-07, + "loss": 0.0077, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 4916 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0, + "epoch": 0.6556, + "grad_norm": 4.256356716156006, + "kl": 0.2197265625, + "learning_rate": 6.722e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4917 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.375, + "epoch": 0.6557333333333333, + "grad_norm": 3.559635639190674, + "kl": 0.14013671875, + "learning_rate": 6.721333333333333e-07, + "loss": 0.0056, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4918 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.6558666666666667, + "grad_norm": 9.041232109069824, + "kl": 0.17431640625, + "learning_rate": 6.720666666666667e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4919 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5, + "epoch": 0.656, + "grad_norm": 6.706234931945801, + "kl": 0.1708984375, + "learning_rate": 6.72e-07, + "loss": 0.0068, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4920 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.125, + "epoch": 0.6561333333333333, + "grad_norm": 5.623392105102539, + "kl": 0.177734375, + "learning_rate": 6.719333333333334e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4921 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.75, + "epoch": 0.6562666666666667, + "grad_norm": 7.459902763366699, + "kl": 0.16845703125, + "learning_rate": 6.718666666666666e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4922 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 0.6564, + "grad_norm": 6.375085830688477, + "kl": 0.2294921875, + "learning_rate": 6.718e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 4923 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.3125, + "epoch": 0.6565333333333333, + "grad_norm": 7.125194072723389, + "kl": 0.18017578125, + "learning_rate": 6.717333333333333e-07, + "loss": 0.0072, + "reward": 1.0625, + "reward_std": 0.7833450436592102, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.75, + "step": 4924 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.25, + "epoch": 0.6566666666666666, + "grad_norm": 7.128632068634033, + "kl": 0.19580078125, + "learning_rate": 6.716666666666666e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4925 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.25, + "epoch": 0.6568, + "grad_norm": 5.932346820831299, + "kl": 0.2158203125, + "learning_rate": 6.716e-07, + "loss": 0.0086, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 4926 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0, + "epoch": 0.6569333333333334, + "grad_norm": 5.766166687011719, + "kl": 0.1611328125, + "learning_rate": 6.715333333333332e-07, + "loss": 0.0064, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4927 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9375, + "epoch": 0.6570666666666667, + "grad_norm": 6.728548049926758, + "kl": 0.18798828125, + "learning_rate": 6.714666666666666e-07, + "loss": 0.0075, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4928 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.125, + "epoch": 0.6572, + "grad_norm": 4.194400310516357, + "kl": 0.18408203125, + "learning_rate": 6.713999999999999e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4929 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.375, + "epoch": 0.6573333333333333, + "grad_norm": 5.773045539855957, + "kl": 0.2216796875, + "learning_rate": 6.713333333333333e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4930 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.1875, + "epoch": 0.6574666666666666, + "grad_norm": 4.2489447593688965, + "kl": 0.21044921875, + "learning_rate": 6.712666666666666e-07, + "loss": 0.0084, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 4931 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 0.6576, + "grad_norm": 6.425819396972656, + "kl": 0.1923828125, + "learning_rate": 6.712e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 4932 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.3125, + "epoch": 0.6577333333333333, + "grad_norm": 45.8774299621582, + "kl": 0.17724609375, + "learning_rate": 6.711333333333333e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 4933 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 0.6578666666666667, + "grad_norm": 9.359580993652344, + "kl": 0.2333984375, + "learning_rate": 6.710666666666667e-07, + "loss": 0.0093, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4934 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5625, + "epoch": 0.658, + "grad_norm": 6.257704734802246, + "kl": 0.1708984375, + "learning_rate": 6.71e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4935 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.6581333333333333, + "grad_norm": 5.346451282501221, + "kl": 0.22802734375, + "learning_rate": 6.709333333333333e-07, + "loss": 0.0091, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4936 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.75, + "epoch": 0.6582666666666667, + "grad_norm": 5.48309326171875, + "kl": 0.21240234375, + "learning_rate": 6.708666666666666e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4937 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6875, + "epoch": 0.6584, + "grad_norm": 4.395689010620117, + "kl": 0.19140625, + "learning_rate": 6.707999999999999e-07, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4938 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.375, + "epoch": 0.6585333333333333, + "grad_norm": 8.701895713806152, + "kl": 0.185546875, + "learning_rate": 6.707333333333333e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 4939 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.25, + "epoch": 0.6586666666666666, + "grad_norm": 8.147085189819336, + "kl": 0.181640625, + "learning_rate": 6.706666666666666e-07, + "loss": 0.0073, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4940 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.875, + "epoch": 0.6588, + "grad_norm": 13.270419120788574, + "kl": 0.408203125, + "learning_rate": 6.706e-07, + "loss": 0.0163, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4941 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.375, + "epoch": 0.6589333333333334, + "grad_norm": 3.7700905799865723, + "kl": 0.171875, + "learning_rate": 6.705333333333333e-07, + "loss": 0.0069, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4942 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.875, + "epoch": 0.6590666666666667, + "grad_norm": 0.37315165996551514, + "kl": 0.13525390625, + "learning_rate": 6.704666666666667e-07, + "loss": 0.0054, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4943 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1875, + "epoch": 0.6592, + "grad_norm": 4.039682388305664, + "kl": 0.15087890625, + "learning_rate": 6.704e-07, + "loss": 0.006, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4944 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5625, + "epoch": 0.6593333333333333, + "grad_norm": 5.940569877624512, + "kl": 0.13916015625, + "learning_rate": 6.703333333333333e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4945 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.6594666666666666, + "grad_norm": 7.085262775421143, + "kl": 0.2236328125, + "learning_rate": 6.702666666666667e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4946 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.6875, + "epoch": 0.6596, + "grad_norm": 8.663301467895508, + "kl": 0.2041015625, + "learning_rate": 6.702e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4947 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3125, + "epoch": 0.6597333333333333, + "grad_norm": 5.793340682983398, + "kl": 0.1640625, + "learning_rate": 6.701333333333334e-07, + "loss": 0.0066, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 4948 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.6598666666666667, + "grad_norm": 6.648059368133545, + "kl": 0.18212890625, + "learning_rate": 6.700666666666666e-07, + "loss": 0.0073, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4949 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.375, + "epoch": 0.66, + "grad_norm": 8.422852516174316, + "kl": 0.17919921875, + "learning_rate": 6.7e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4950 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.4375, + "epoch": 0.6601333333333333, + "grad_norm": 7.973820686340332, + "kl": 0.27001953125, + "learning_rate": 6.699333333333333e-07, + "loss": 0.0108, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4951 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.625, + "epoch": 0.6602666666666667, + "grad_norm": 6.696588516235352, + "kl": 0.1962890625, + "learning_rate": 6.698666666666667e-07, + "loss": 0.0079, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4952 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75, + "epoch": 0.6604, + "grad_norm": 4.299405097961426, + "kl": 0.23193359375, + "learning_rate": 6.697999999999999e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4953 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.25, + "epoch": 0.6605333333333333, + "grad_norm": 12.65510368347168, + "kl": 0.4560546875, + "learning_rate": 6.697333333333332e-07, + "loss": 0.0182, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4954 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.3125, + "epoch": 0.6606666666666666, + "grad_norm": 4.448477745056152, + "kl": 0.17822265625, + "learning_rate": 6.696666666666666e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4955 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.8125, + "epoch": 0.6608, + "grad_norm": 3.978189468383789, + "kl": 0.2177734375, + "learning_rate": 6.695999999999999e-07, + "loss": 0.0087, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4956 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 0.6609333333333334, + "grad_norm": 6.386631965637207, + "kl": 0.2392578125, + "learning_rate": 6.695333333333333e-07, + "loss": 0.0096, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4957 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.75, + "epoch": 0.6610666666666667, + "grad_norm": 9.222162246704102, + "kl": 0.17138671875, + "learning_rate": 6.694666666666666e-07, + "loss": 0.0069, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4958 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.0, + "epoch": 0.6612, + "grad_norm": 10.4269380569458, + "kl": 0.197265625, + "learning_rate": 6.694e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4959 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 0.6613333333333333, + "grad_norm": 8.455401420593262, + "kl": 0.220703125, + "learning_rate": 6.693333333333333e-07, + "loss": 0.0088, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 4960 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.9375, + "epoch": 0.6614666666666666, + "grad_norm": 6.522287368774414, + "kl": 0.2255859375, + "learning_rate": 6.692666666666667e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 4961 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.375, + "epoch": 0.6616, + "grad_norm": 6.512232303619385, + "kl": 0.193359375, + "learning_rate": 6.692e-07, + "loss": 0.0077, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4962 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.125, + "epoch": 0.6617333333333333, + "grad_norm": 5.843291282653809, + "kl": 0.2060546875, + "learning_rate": 6.691333333333333e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 4963 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 0.6618666666666667, + "grad_norm": 0.3550069034099579, + "kl": 0.22216796875, + "learning_rate": 6.690666666666667e-07, + "loss": 0.0089, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4964 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6875, + "epoch": 0.662, + "grad_norm": 7.043411731719971, + "kl": 0.20068359375, + "learning_rate": 6.69e-07, + "loss": 0.008, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4965 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0, + "epoch": 0.6621333333333334, + "grad_norm": 8.304357528686523, + "kl": 0.197265625, + "learning_rate": 6.689333333333334e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 4966 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.1875, + "epoch": 0.6622666666666667, + "grad_norm": 9.215298652648926, + "kl": 0.2294921875, + "learning_rate": 6.688666666666667e-07, + "loss": 0.0092, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 4967 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.6624, + "grad_norm": 4.943840980529785, + "kl": 0.201171875, + "learning_rate": 6.688e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 4968 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.6625333333333333, + "grad_norm": 5.276886940002441, + "kl": 0.2177734375, + "learning_rate": 6.687333333333332e-07, + "loss": 0.0087, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4969 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.3125, + "epoch": 0.6626666666666666, + "grad_norm": 6.5098557472229, + "kl": 0.2705078125, + "learning_rate": 6.686666666666666e-07, + "loss": 0.0108, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4970 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 0.6628, + "grad_norm": 4.685329914093018, + "kl": 0.1494140625, + "learning_rate": 6.685999999999999e-07, + "loss": 0.006, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 4971 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.9375, + "epoch": 0.6629333333333334, + "grad_norm": 4.840237140655518, + "kl": 0.236328125, + "learning_rate": 6.685333333333332e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 4972 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.875, + "epoch": 0.6630666666666667, + "grad_norm": 6.532092094421387, + "kl": 0.14794921875, + "learning_rate": 6.684666666666666e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4973 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 0.6632, + "grad_norm": 7.144848346710205, + "kl": 0.18994140625, + "learning_rate": 6.683999999999999e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 4974 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.625, + "epoch": 0.6633333333333333, + "grad_norm": 8.645892143249512, + "kl": 0.1884765625, + "learning_rate": 6.683333333333333e-07, + "loss": 0.0075, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4975 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.9375, + "epoch": 0.6634666666666666, + "grad_norm": 0.3583800196647644, + "kl": 0.1943359375, + "learning_rate": 6.682666666666666e-07, + "loss": 0.0078, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4976 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.0, + "epoch": 0.6636, + "grad_norm": 1.8710010051727295, + "kl": 0.21044921875, + "learning_rate": 6.682e-07, + "loss": 0.0084, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4977 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0, + "epoch": 0.6637333333333333, + "grad_norm": 7.1694817543029785, + "kl": 0.18212890625, + "learning_rate": 6.681333333333333e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4978 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 0.6638666666666667, + "grad_norm": 5.036263942718506, + "kl": 0.17529296875, + "learning_rate": 6.680666666666667e-07, + "loss": 0.007, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 4979 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5, + "epoch": 0.664, + "grad_norm": 6.420443058013916, + "kl": 0.23876953125, + "learning_rate": 6.68e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 4980 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.625, + "epoch": 0.6641333333333334, + "grad_norm": 5.6731085777282715, + "kl": 0.17919921875, + "learning_rate": 6.679333333333334e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 4981 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.375, + "epoch": 0.6642666666666667, + "grad_norm": 6.125817775726318, + "kl": 0.17626953125, + "learning_rate": 6.678666666666667e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 4982 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5625, + "epoch": 0.6644, + "grad_norm": 3.02449631690979, + "kl": 0.224365234375, + "learning_rate": 6.677999999999999e-07, + "loss": 0.009, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 4983 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.6645333333333333, + "grad_norm": 9.228706359863281, + "kl": 0.287109375, + "learning_rate": 6.677333333333333e-07, + "loss": 0.0115, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4984 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.6646666666666666, + "grad_norm": 4.576854705810547, + "kl": 0.2529296875, + "learning_rate": 6.676666666666666e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 4985 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0625, + "epoch": 0.6648, + "grad_norm": 8.8344087600708, + "kl": 0.27392578125, + "learning_rate": 6.676e-07, + "loss": 0.011, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 4986 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.6649333333333334, + "grad_norm": 4.984444618225098, + "kl": 0.173828125, + "learning_rate": 6.675333333333333e-07, + "loss": 0.007, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 4987 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 0.6650666666666667, + "grad_norm": 3.6961007118225098, + "kl": 0.1611328125, + "learning_rate": 6.674666666666667e-07, + "loss": 0.0064, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4988 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.6652, + "grad_norm": 2.980605125427246, + "kl": 0.1953125, + "learning_rate": 6.674e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4989 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.9375, + "epoch": 0.6653333333333333, + "grad_norm": 7.181760787963867, + "kl": 0.18798828125, + "learning_rate": 6.673333333333334e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4990 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4375, + "epoch": 0.6654666666666667, + "grad_norm": 10.682826042175293, + "kl": 0.3818359375, + "learning_rate": 6.672666666666666e-07, + "loss": 0.0153, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 4991 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.6875, + "epoch": 0.6656, + "grad_norm": 6.767422676086426, + "kl": 0.310546875, + "learning_rate": 6.671999999999999e-07, + "loss": 0.0124, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 4992 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.6657333333333333, + "grad_norm": 8.815357208251953, + "kl": 0.20654296875, + "learning_rate": 6.671333333333333e-07, + "loss": 0.0083, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 4993 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5625, + "epoch": 0.6658666666666667, + "grad_norm": 7.982615947723389, + "kl": 0.22265625, + "learning_rate": 6.670666666666666e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 4994 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.4375, + "epoch": 0.666, + "grad_norm": 3.6696667671203613, + "kl": 0.20849609375, + "learning_rate": 6.67e-07, + "loss": 0.0084, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 4995 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.8125, + "epoch": 0.6661333333333334, + "grad_norm": 0.4867393374443054, + "kl": 0.26953125, + "learning_rate": 6.669333333333333e-07, + "loss": 0.0108, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4996 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.6662666666666667, + "grad_norm": 3.4957306385040283, + "kl": 0.1875, + "learning_rate": 6.668666666666667e-07, + "loss": 0.0075, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 4997 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1875, + "epoch": 0.6664, + "grad_norm": 7.430747985839844, + "kl": 0.21875, + "learning_rate": 6.667999999999999e-07, + "loss": 0.0088, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 4998 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0625, + "epoch": 0.6665333333333333, + "grad_norm": 6.136590480804443, + "kl": 0.2373046875, + "learning_rate": 6.667333333333333e-07, + "loss": 0.0095, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 4999 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.3125, + "epoch": 0.6666666666666666, + "grad_norm": 5.814762115478516, + "kl": 0.17626953125, + "learning_rate": 6.666666666666666e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5000 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.6668, + "grad_norm": 8.696027755737305, + "kl": 0.29736328125, + "learning_rate": 6.665999999999999e-07, + "loss": 0.0119, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 5001 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.625, + "epoch": 0.6669333333333334, + "grad_norm": 5.669267654418945, + "kl": 0.26806640625, + "learning_rate": 6.665333333333333e-07, + "loss": 0.0107, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5002 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.0, + "epoch": 0.6670666666666667, + "grad_norm": 12.06875228881836, + "kl": 0.3564453125, + "learning_rate": 6.664666666666666e-07, + "loss": 0.0142, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5003 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.125, + "epoch": 0.6672, + "grad_norm": 4.779266357421875, + "kl": 0.322265625, + "learning_rate": 6.664e-07, + "loss": 0.0129, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5004 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 0.6673333333333333, + "grad_norm": 5.554072380065918, + "kl": 0.19189453125, + "learning_rate": 6.663333333333333e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5005 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 0.6674666666666667, + "grad_norm": 4.737905502319336, + "kl": 0.220703125, + "learning_rate": 6.662666666666667e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5006 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 0.6676, + "grad_norm": 4.570505619049072, + "kl": 0.158447265625, + "learning_rate": 6.662e-07, + "loss": 0.0064, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5007 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.8125, + "epoch": 0.6677333333333333, + "grad_norm": 8.808292388916016, + "kl": 0.205078125, + "learning_rate": 6.661333333333334e-07, + "loss": 0.0082, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5008 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0625, + "epoch": 0.6678666666666667, + "grad_norm": 8.438344955444336, + "kl": 0.279296875, + "learning_rate": 6.660666666666667e-07, + "loss": 0.0112, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5009 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4375, + "epoch": 0.668, + "grad_norm": 3.166048526763916, + "kl": 0.267578125, + "learning_rate": 6.66e-07, + "loss": 0.0107, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5010 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.25, + "epoch": 0.6681333333333334, + "grad_norm": 7.942447662353516, + "kl": 0.23388671875, + "learning_rate": 6.659333333333334e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5011 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.625, + "epoch": 0.6682666666666667, + "grad_norm": 5.884252548217773, + "kl": 0.2578125, + "learning_rate": 6.658666666666666e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5012 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.75, + "epoch": 0.6684, + "grad_norm": 23.088422775268555, + "kl": 0.17333984375, + "learning_rate": 6.657999999999999e-07, + "loss": 0.0069, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5013 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.6875, + "epoch": 0.6685333333333333, + "grad_norm": 5.494575500488281, + "kl": 0.19970703125, + "learning_rate": 6.657333333333332e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5014 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.125, + "epoch": 0.6686666666666666, + "grad_norm": 6.780130863189697, + "kl": 0.29638671875, + "learning_rate": 6.656666666666666e-07, + "loss": 0.0119, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5015 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0625, + "epoch": 0.6688, + "grad_norm": 8.72717571258545, + "kl": 0.2470703125, + "learning_rate": 6.655999999999999e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5016 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 0.6689333333333334, + "grad_norm": 4.793899059295654, + "kl": 0.23193359375, + "learning_rate": 6.655333333333333e-07, + "loss": 0.0093, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5017 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5625, + "epoch": 0.6690666666666667, + "grad_norm": 16.912364959716797, + "kl": 0.16552734375, + "learning_rate": 6.654666666666666e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5018 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3125, + "epoch": 0.6692, + "grad_norm": 7.484317779541016, + "kl": 0.19140625, + "learning_rate": 6.653999999999999e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5019 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.6693333333333333, + "grad_norm": 7.132925510406494, + "kl": 0.2822265625, + "learning_rate": 6.653333333333333e-07, + "loss": 0.0113, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5020 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 0.6694666666666667, + "grad_norm": 17.56513786315918, + "kl": 0.7255859375, + "learning_rate": 6.652666666666666e-07, + "loss": 0.0291, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5021 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9375, + "epoch": 0.6696, + "grad_norm": 6.948188781738281, + "kl": 0.22021484375, + "learning_rate": 6.652e-07, + "loss": 0.0088, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5022 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.375, + "epoch": 0.6697333333333333, + "grad_norm": 4.076338768005371, + "kl": 0.20947265625, + "learning_rate": 6.651333333333333e-07, + "loss": 0.0084, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5023 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 0.6698666666666667, + "grad_norm": 4.366171836853027, + "kl": 0.21923828125, + "learning_rate": 6.650666666666667e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5024 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.9375, + "epoch": 0.67, + "grad_norm": 5.47548770904541, + "kl": 0.169921875, + "learning_rate": 6.65e-07, + "loss": 0.0068, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5025 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.6701333333333334, + "grad_norm": 4.49560546875, + "kl": 0.22021484375, + "learning_rate": 6.649333333333334e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5026 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.6702666666666667, + "grad_norm": 9.261268615722656, + "kl": 0.23583984375, + "learning_rate": 6.648666666666667e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5027 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.6704, + "grad_norm": 4.236723899841309, + "kl": 0.1787109375, + "learning_rate": 6.647999999999999e-07, + "loss": 0.0072, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5028 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.6705333333333333, + "grad_norm": 6.8026556968688965, + "kl": 0.3359375, + "learning_rate": 6.647333333333333e-07, + "loss": 0.0134, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 5029 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1875, + "epoch": 0.6706666666666666, + "grad_norm": 9.847421646118164, + "kl": 0.18310546875, + "learning_rate": 6.646666666666666e-07, + "loss": 0.0073, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5030 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 0.6708, + "grad_norm": 7.024716854095459, + "kl": 0.22314453125, + "learning_rate": 6.646e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5031 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.6709333333333334, + "grad_norm": 7.148196697235107, + "kl": 0.21826171875, + "learning_rate": 6.645333333333332e-07, + "loss": 0.0087, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5032 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 0.6710666666666667, + "grad_norm": 8.495783805847168, + "kl": 0.19189453125, + "learning_rate": 6.644666666666666e-07, + "loss": 0.0077, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5033 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.75, + "epoch": 0.6712, + "grad_norm": 0.4163222312927246, + "kl": 0.1943359375, + "learning_rate": 6.643999999999999e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5034 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 0.6713333333333333, + "grad_norm": 4.971634864807129, + "kl": 0.19384765625, + "learning_rate": 6.643333333333333e-07, + "loss": 0.0078, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5035 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.75, + "epoch": 0.6714666666666667, + "grad_norm": 11.62204647064209, + "kl": 0.2158203125, + "learning_rate": 6.642666666666666e-07, + "loss": 0.0086, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5036 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.6716, + "grad_norm": 9.963953018188477, + "kl": 0.169921875, + "learning_rate": 6.642e-07, + "loss": 0.0068, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5037 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5625, + "epoch": 0.6717333333333333, + "grad_norm": 0.42118918895721436, + "kl": 0.18896484375, + "learning_rate": 6.641333333333333e-07, + "loss": 0.0076, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5038 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.625, + "epoch": 0.6718666666666666, + "grad_norm": 6.734068393707275, + "kl": 0.2353515625, + "learning_rate": 6.640666666666666e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5039 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 0.672, + "grad_norm": 7.972269535064697, + "kl": 0.18896484375, + "learning_rate": 6.64e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5040 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.4375, + "epoch": 0.6721333333333334, + "grad_norm": 7.033408164978027, + "kl": 0.171875, + "learning_rate": 6.639333333333333e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5041 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.6722666666666667, + "grad_norm": 5.116366863250732, + "kl": 0.25390625, + "learning_rate": 6.638666666666667e-07, + "loss": 0.0102, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5042 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.875, + "epoch": 0.6724, + "grad_norm": 7.08353328704834, + "kl": 0.193359375, + "learning_rate": 6.637999999999999e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5043 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0625, + "epoch": 0.6725333333333333, + "grad_norm": 9.463587760925293, + "kl": 0.28955078125, + "learning_rate": 6.637333333333333e-07, + "loss": 0.0116, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5044 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 0.6726666666666666, + "grad_norm": 12.817028999328613, + "kl": 0.3369140625, + "learning_rate": 6.636666666666666e-07, + "loss": 0.0135, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5045 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 0.6728, + "grad_norm": 4.031876564025879, + "kl": 0.2041015625, + "learning_rate": 6.636e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5046 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.6729333333333334, + "grad_norm": 6.597187042236328, + "kl": 0.20556640625, + "learning_rate": 6.635333333333333e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5047 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.1875, + "epoch": 0.6730666666666667, + "grad_norm": 7.9872846603393555, + "kl": 0.22802734375, + "learning_rate": 6.634666666666666e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5048 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9375, + "epoch": 0.6732, + "grad_norm": 25.807048797607422, + "kl": 0.2119140625, + "learning_rate": 6.634e-07, + "loss": 0.0085, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5049 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 0.6733333333333333, + "grad_norm": 8.612305641174316, + "kl": 0.2138671875, + "learning_rate": 6.633333333333333e-07, + "loss": 0.0085, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5050 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.75, + "epoch": 0.6734666666666667, + "grad_norm": 4.222599506378174, + "kl": 0.2314453125, + "learning_rate": 6.632666666666667e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5051 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.6736, + "grad_norm": 5.6207990646362305, + "kl": 0.20068359375, + "learning_rate": 6.632e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5052 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 0.6737333333333333, + "grad_norm": 6.147931098937988, + "kl": 0.15478515625, + "learning_rate": 6.631333333333334e-07, + "loss": 0.0062, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5053 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 0.6738666666666666, + "grad_norm": 6.715574264526367, + "kl": 0.24560546875, + "learning_rate": 6.630666666666666e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5054 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0, + "epoch": 0.674, + "grad_norm": 7.199361801147461, + "kl": 0.18798828125, + "learning_rate": 6.63e-07, + "loss": 0.0075, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5055 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 0.6741333333333334, + "grad_norm": 5.590661525726318, + "kl": 0.21337890625, + "learning_rate": 6.629333333333333e-07, + "loss": 0.0085, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5056 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1875, + "epoch": 0.6742666666666667, + "grad_norm": 7.259356498718262, + "kl": 0.15087890625, + "learning_rate": 6.628666666666666e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5057 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0625, + "epoch": 0.6744, + "grad_norm": 4.290589809417725, + "kl": 0.205078125, + "learning_rate": 6.627999999999999e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5058 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 0.6745333333333333, + "grad_norm": 4.145617961883545, + "kl": 0.197265625, + "learning_rate": 6.627333333333332e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5059 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.25, + "epoch": 0.6746666666666666, + "grad_norm": 3.39477801322937, + "kl": 0.19921875, + "learning_rate": 6.626666666666666e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5060 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 0.6748, + "grad_norm": 8.047043800354004, + "kl": 0.228515625, + "learning_rate": 6.625999999999999e-07, + "loss": 0.0091, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5061 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6875, + "epoch": 0.6749333333333334, + "grad_norm": 6.15949821472168, + "kl": 0.16064453125, + "learning_rate": 6.625333333333333e-07, + "loss": 0.0064, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5062 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.6750666666666667, + "grad_norm": 8.834367752075195, + "kl": 0.22265625, + "learning_rate": 6.624666666666666e-07, + "loss": 0.0089, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5063 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5, + "epoch": 0.6752, + "grad_norm": 4.931618690490723, + "kl": 0.2529296875, + "learning_rate": 6.624e-07, + "loss": 0.0101, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5064 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.9375, + "epoch": 0.6753333333333333, + "grad_norm": 11.259037017822266, + "kl": 0.22119140625, + "learning_rate": 6.623333333333333e-07, + "loss": 0.0088, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5065 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.1875, + "epoch": 0.6754666666666667, + "grad_norm": 7.0636305809021, + "kl": 0.18701171875, + "learning_rate": 6.622666666666666e-07, + "loss": 0.0075, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5066 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.6756, + "grad_norm": 7.456300735473633, + "kl": 0.17822265625, + "learning_rate": 6.622e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5067 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.6757333333333333, + "grad_norm": 0.7336412668228149, + "kl": 0.2373046875, + "learning_rate": 6.621333333333333e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5068 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.6758666666666666, + "grad_norm": 0.4253988564014435, + "kl": 0.23291015625, + "learning_rate": 6.620666666666667e-07, + "loss": 0.0093, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5069 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.676, + "grad_norm": 9.106932640075684, + "kl": 0.21142578125, + "learning_rate": 6.62e-07, + "loss": 0.0085, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5070 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.0, + "epoch": 0.6761333333333334, + "grad_norm": 4.660316467285156, + "kl": 0.2177734375, + "learning_rate": 6.619333333333334e-07, + "loss": 0.0087, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5071 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 0.6762666666666667, + "grad_norm": 6.709441661834717, + "kl": 0.16552734375, + "learning_rate": 6.618666666666667e-07, + "loss": 0.0066, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5072 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.6764, + "grad_norm": 7.996440410614014, + "kl": 0.20703125, + "learning_rate": 6.618000000000001e-07, + "loss": 0.0083, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5073 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.375, + "epoch": 0.6765333333333333, + "grad_norm": 8.984334945678711, + "kl": 0.173828125, + "learning_rate": 6.617333333333333e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5074 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.125, + "epoch": 0.6766666666666666, + "grad_norm": 7.711846828460693, + "kl": 0.203125, + "learning_rate": 6.616666666666665e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5075 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.8125, + "epoch": 0.6768, + "grad_norm": 5.296474456787109, + "kl": 0.2744140625, + "learning_rate": 6.615999999999999e-07, + "loss": 0.011, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5076 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8125, + "epoch": 0.6769333333333334, + "grad_norm": 50.001251220703125, + "kl": 0.13525390625, + "learning_rate": 6.615333333333332e-07, + "loss": 0.0054, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5077 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5, + "epoch": 0.6770666666666667, + "grad_norm": 11.914440155029297, + "kl": 0.232421875, + "learning_rate": 6.614666666666666e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5078 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.5, + "epoch": 0.6772, + "grad_norm": 7.256964206695557, + "kl": 0.17529296875, + "learning_rate": 6.613999999999999e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5079 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.6773333333333333, + "grad_norm": 6.903909206390381, + "kl": 0.2490234375, + "learning_rate": 6.613333333333333e-07, + "loss": 0.0099, + "reward": 1.0625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.875, + "step": 5080 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.4375, + "epoch": 0.6774666666666667, + "grad_norm": 37.818763732910156, + "kl": 0.15478515625, + "learning_rate": 6.612666666666666e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5081 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.125, + "epoch": 0.6776, + "grad_norm": 5.370156288146973, + "kl": 0.2724609375, + "learning_rate": 6.612e-07, + "loss": 0.0109, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5082 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6875, + "epoch": 0.6777333333333333, + "grad_norm": 5.085986137390137, + "kl": 0.22314453125, + "learning_rate": 6.611333333333333e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5083 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9375, + "epoch": 0.6778666666666666, + "grad_norm": 6.601032257080078, + "kl": 0.19482421875, + "learning_rate": 6.610666666666667e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5084 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.0625, + "epoch": 0.678, + "grad_norm": 4.68630838394165, + "kl": 0.28857421875, + "learning_rate": 6.61e-07, + "loss": 0.0115, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5085 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.6875, + "epoch": 0.6781333333333334, + "grad_norm": 4.3427300453186035, + "kl": 0.20556640625, + "learning_rate": 6.609333333333333e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5086 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.25, + "epoch": 0.6782666666666667, + "grad_norm": 5.692939758300781, + "kl": 0.2548828125, + "learning_rate": 6.608666666666667e-07, + "loss": 0.0102, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5087 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 0.6784, + "grad_norm": 7.114565372467041, + "kl": 0.1767578125, + "learning_rate": 6.608e-07, + "loss": 0.0071, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5088 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.75, + "epoch": 0.6785333333333333, + "grad_norm": 7.8955464363098145, + "kl": 0.25, + "learning_rate": 6.607333333333333e-07, + "loss": 0.01, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5089 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.75, + "epoch": 0.6786666666666666, + "grad_norm": 6.77060604095459, + "kl": 0.1923828125, + "learning_rate": 6.606666666666666e-07, + "loss": 0.0077, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5090 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.1875, + "epoch": 0.6788, + "grad_norm": 12.572741508483887, + "kl": 0.16455078125, + "learning_rate": 6.606e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5091 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.125, + "epoch": 0.6789333333333334, + "grad_norm": 7.8166046142578125, + "kl": 0.21337890625, + "learning_rate": 6.605333333333333e-07, + "loss": 0.0085, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5092 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0625, + "epoch": 0.6790666666666667, + "grad_norm": 7.041306018829346, + "kl": 0.1982421875, + "learning_rate": 6.604666666666667e-07, + "loss": 0.0079, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5093 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.875, + "epoch": 0.6792, + "grad_norm": 6.399464130401611, + "kl": 0.20849609375, + "learning_rate": 6.604e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5094 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0, + "epoch": 0.6793333333333333, + "grad_norm": 7.148175239562988, + "kl": 0.24072265625, + "learning_rate": 6.603333333333333e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5095 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.25, + "epoch": 0.6794666666666667, + "grad_norm": 6.1878485679626465, + "kl": 0.1640625, + "learning_rate": 6.602666666666666e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5096 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.125, + "epoch": 0.6796, + "grad_norm": 19.065372467041016, + "kl": 0.14892578125, + "learning_rate": 6.601999999999999e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5097 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.875, + "epoch": 0.6797333333333333, + "grad_norm": 4.803472518920898, + "kl": 0.14892578125, + "learning_rate": 6.601333333333333e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5098 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.125, + "epoch": 0.6798666666666666, + "grad_norm": 7.464824199676514, + "kl": 0.228515625, + "learning_rate": 6.600666666666666e-07, + "loss": 0.0091, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5099 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 0.68, + "grad_norm": 7.0020976066589355, + "kl": 0.152587890625, + "learning_rate": 6.6e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5100 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.8125, + "epoch": 0.6801333333333334, + "grad_norm": 3.298819065093994, + "kl": 0.17041015625, + "learning_rate": 6.599333333333333e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5101 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.6802666666666667, + "grad_norm": 6.095918655395508, + "kl": 0.189453125, + "learning_rate": 6.598666666666667e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5102 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.625, + "epoch": 0.6804, + "grad_norm": 6.360477924346924, + "kl": 0.1953125, + "learning_rate": 6.598e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5103 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.75, + "epoch": 0.6805333333333333, + "grad_norm": 0.4050675630569458, + "kl": 0.19287109375, + "learning_rate": 6.597333333333332e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5104 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.375, + "epoch": 0.6806666666666666, + "grad_norm": 7.35431432723999, + "kl": 0.2421875, + "learning_rate": 6.596666666666666e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5105 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.375, + "epoch": 0.6808, + "grad_norm": 6.75109338760376, + "kl": 0.19677734375, + "learning_rate": 6.595999999999999e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5106 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0625, + "epoch": 0.6809333333333333, + "grad_norm": 6.190622329711914, + "kl": 0.2509765625, + "learning_rate": 6.595333333333333e-07, + "loss": 0.01, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5107 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.5625, + "epoch": 0.6810666666666667, + "grad_norm": 7.425336837768555, + "kl": 0.255859375, + "learning_rate": 6.594666666666666e-07, + "loss": 0.0102, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5108 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5625, + "epoch": 0.6812, + "grad_norm": 7.249377727508545, + "kl": 0.20751953125, + "learning_rate": 6.594e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5109 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.6813333333333333, + "grad_norm": 6.790256500244141, + "kl": 0.20654296875, + "learning_rate": 6.593333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5110 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.125, + "epoch": 0.6814666666666667, + "grad_norm": 6.925076961517334, + "kl": 0.2099609375, + "learning_rate": 6.592666666666667e-07, + "loss": 0.0084, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5111 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 0.6816, + "grad_norm": 16.785343170166016, + "kl": 0.2861328125, + "learning_rate": 6.592e-07, + "loss": 0.0115, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5112 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.875, + "epoch": 0.6817333333333333, + "grad_norm": 7.992170333862305, + "kl": 0.2080078125, + "learning_rate": 6.591333333333333e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5113 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 0.6818666666666666, + "grad_norm": 7.371303558349609, + "kl": 0.21533203125, + "learning_rate": 6.590666666666667e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5114 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.8125, + "epoch": 0.682, + "grad_norm": 5.6411566734313965, + "kl": 0.283203125, + "learning_rate": 6.59e-07, + "loss": 0.0113, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5115 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 0.6821333333333334, + "grad_norm": 5.301609992980957, + "kl": 0.20751953125, + "learning_rate": 6.589333333333334e-07, + "loss": 0.0083, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5116 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 0.6822666666666667, + "grad_norm": 8.863112449645996, + "kl": 0.35693359375, + "learning_rate": 6.588666666666667e-07, + "loss": 0.0143, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5117 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.1875, + "epoch": 0.6824, + "grad_norm": 0.43004515767097473, + "kl": 0.22314453125, + "learning_rate": 6.588e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5118 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.75, + "epoch": 0.6825333333333333, + "grad_norm": 8.927469253540039, + "kl": 0.2314453125, + "learning_rate": 6.587333333333332e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5119 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5, + "epoch": 0.6826666666666666, + "grad_norm": 4.07039213180542, + "kl": 0.24169921875, + "learning_rate": 6.586666666666666e-07, + "loss": 0.0097, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5120 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 0.6828, + "grad_norm": 10.750625610351562, + "kl": 0.314453125, + "learning_rate": 6.585999999999999e-07, + "loss": 0.0126, + "reward": 0.9375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8125, + "step": 5121 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.875, + "epoch": 0.6829333333333333, + "grad_norm": 0.4813675284385681, + "kl": 0.23583984375, + "learning_rate": 6.585333333333332e-07, + "loss": 0.0094, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5122 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 0.6830666666666667, + "grad_norm": 5.680892467498779, + "kl": 0.2314453125, + "learning_rate": 6.584666666666666e-07, + "loss": 0.0093, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5123 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.6832, + "grad_norm": 7.58445930480957, + "kl": 0.22119140625, + "learning_rate": 6.583999999999999e-07, + "loss": 0.0089, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5124 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.625, + "epoch": 0.6833333333333333, + "grad_norm": 9.021341323852539, + "kl": 0.234375, + "learning_rate": 6.583333333333333e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5125 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.8125, + "epoch": 0.6834666666666667, + "grad_norm": 4.921731472015381, + "kl": 0.24853515625, + "learning_rate": 6.582666666666666e-07, + "loss": 0.0099, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5126 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.6875, + "epoch": 0.6836, + "grad_norm": 8.036669731140137, + "kl": 0.2587890625, + "learning_rate": 6.582e-07, + "loss": 0.0104, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5127 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0, + "epoch": 0.6837333333333333, + "grad_norm": 7.462286949157715, + "kl": 0.2353515625, + "learning_rate": 6.581333333333333e-07, + "loss": 0.0094, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5128 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5625, + "epoch": 0.6838666666666666, + "grad_norm": 3.7221758365631104, + "kl": 0.20263671875, + "learning_rate": 6.580666666666667e-07, + "loss": 0.0081, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5129 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.1875, + "epoch": 0.684, + "grad_norm": 5.803214073181152, + "kl": 0.2939453125, + "learning_rate": 6.58e-07, + "loss": 0.0118, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5130 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.25, + "epoch": 0.6841333333333334, + "grad_norm": 9.320685386657715, + "kl": 0.283203125, + "learning_rate": 6.579333333333334e-07, + "loss": 0.0113, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5131 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 0.6842666666666667, + "grad_norm": 6.140519142150879, + "kl": 0.1572265625, + "learning_rate": 6.578666666666667e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5132 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.6844, + "grad_norm": 7.292617321014404, + "kl": 0.21240234375, + "learning_rate": 6.578e-07, + "loss": 0.0085, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5133 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.0625, + "epoch": 0.6845333333333333, + "grad_norm": 6.677008628845215, + "kl": 0.34130859375, + "learning_rate": 6.577333333333333e-07, + "loss": 0.0137, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5134 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.125, + "epoch": 0.6846666666666666, + "grad_norm": 8.78188419342041, + "kl": 0.2314453125, + "learning_rate": 6.576666666666666e-07, + "loss": 0.0093, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5135 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.9375, + "epoch": 0.6848, + "grad_norm": 3.641543388366699, + "kl": 0.19921875, + "learning_rate": 6.576e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5136 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6875, + "epoch": 0.6849333333333333, + "grad_norm": 7.154377460479736, + "kl": 0.15478515625, + "learning_rate": 6.575333333333333e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5137 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.4375, + "epoch": 0.6850666666666667, + "grad_norm": 7.884796619415283, + "kl": 0.18505859375, + "learning_rate": 6.574666666666667e-07, + "loss": 0.0074, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5138 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0, + "epoch": 0.6852, + "grad_norm": 9.465056419372559, + "kl": 0.19091796875, + "learning_rate": 6.573999999999999e-07, + "loss": 0.0076, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5139 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1875, + "epoch": 0.6853333333333333, + "grad_norm": 6.299219131469727, + "kl": 0.15380859375, + "learning_rate": 6.573333333333333e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5140 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75, + "epoch": 0.6854666666666667, + "grad_norm": 7.237424373626709, + "kl": 0.2158203125, + "learning_rate": 6.572666666666666e-07, + "loss": 0.0086, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5141 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.3125, + "epoch": 0.6856, + "grad_norm": 5.530747890472412, + "kl": 0.201171875, + "learning_rate": 6.571999999999999e-07, + "loss": 0.008, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5142 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8125, + "epoch": 0.6857333333333333, + "grad_norm": 7.100956916809082, + "kl": 0.2509765625, + "learning_rate": 6.571333333333333e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5143 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.3125, + "epoch": 0.6858666666666666, + "grad_norm": 5.545714855194092, + "kl": 0.19287109375, + "learning_rate": 6.570666666666666e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5144 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.75, + "epoch": 0.686, + "grad_norm": 6.790167808532715, + "kl": 0.18505859375, + "learning_rate": 6.57e-07, + "loss": 0.0074, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5145 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 0.6861333333333334, + "grad_norm": 5.64802885055542, + "kl": 0.18359375, + "learning_rate": 6.569333333333333e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5146 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.25, + "epoch": 0.6862666666666667, + "grad_norm": 6.6074442863464355, + "kl": 0.2021484375, + "learning_rate": 6.568666666666667e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5147 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 0.6864, + "grad_norm": 4.5170207023620605, + "kl": 0.16943359375, + "learning_rate": 6.568e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5148 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5625, + "epoch": 0.6865333333333333, + "grad_norm": 5.464859485626221, + "kl": 0.1591796875, + "learning_rate": 6.567333333333333e-07, + "loss": 0.0064, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5149 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.4375, + "epoch": 0.6866666666666666, + "grad_norm": 6.761147499084473, + "kl": 0.21484375, + "learning_rate": 6.566666666666666e-07, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5150 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.625, + "epoch": 0.6868, + "grad_norm": 5.418560981750488, + "kl": 0.16650390625, + "learning_rate": 6.565999999999999e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5151 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.1875, + "epoch": 0.6869333333333333, + "grad_norm": 6.220636367797852, + "kl": 0.20556640625, + "learning_rate": 6.565333333333333e-07, + "loss": 0.0082, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5152 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5625, + "epoch": 0.6870666666666667, + "grad_norm": 8.707215309143066, + "kl": 0.17529296875, + "learning_rate": 6.564666666666666e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5153 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.875, + "epoch": 0.6872, + "grad_norm": 7.94049596786499, + "kl": 0.25, + "learning_rate": 6.564e-07, + "loss": 0.01, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5154 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.25, + "epoch": 0.6873333333333334, + "grad_norm": 7.769327640533447, + "kl": 0.19921875, + "learning_rate": 6.563333333333333e-07, + "loss": 0.008, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5155 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.6874666666666667, + "grad_norm": 7.629834175109863, + "kl": 0.26318359375, + "learning_rate": 6.562666666666667e-07, + "loss": 0.0105, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5156 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.25, + "epoch": 0.6876, + "grad_norm": 7.755242347717285, + "kl": 0.2060546875, + "learning_rate": 6.562e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5157 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 0.6877333333333333, + "grad_norm": 7.134635925292969, + "kl": 0.18115234375, + "learning_rate": 6.561333333333334e-07, + "loss": 0.0072, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5158 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 0.6878666666666666, + "grad_norm": 5.548975944519043, + "kl": 0.2392578125, + "learning_rate": 6.560666666666667e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5159 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 0.688, + "grad_norm": 6.221675395965576, + "kl": 0.23974609375, + "learning_rate": 6.56e-07, + "loss": 0.0096, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5160 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8125, + "epoch": 0.6881333333333334, + "grad_norm": 7.5056610107421875, + "kl": 0.24267578125, + "learning_rate": 6.559333333333333e-07, + "loss": 0.0097, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5161 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.125, + "epoch": 0.6882666666666667, + "grad_norm": 8.569498062133789, + "kl": 0.234375, + "learning_rate": 6.558666666666666e-07, + "loss": 0.0094, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5162 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.875, + "epoch": 0.6884, + "grad_norm": 3.9152233600616455, + "kl": 0.22412109375, + "learning_rate": 6.558e-07, + "loss": 0.009, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5163 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5, + "epoch": 0.6885333333333333, + "grad_norm": 8.521480560302734, + "kl": 0.2021484375, + "learning_rate": 6.557333333333332e-07, + "loss": 0.0081, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5164 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 0.6886666666666666, + "grad_norm": 6.647163391113281, + "kl": 0.2880859375, + "learning_rate": 6.556666666666666e-07, + "loss": 0.0115, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5165 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 0.6888, + "grad_norm": 6.179965496063232, + "kl": 0.23681640625, + "learning_rate": 6.555999999999999e-07, + "loss": 0.0095, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5166 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.625, + "epoch": 0.6889333333333333, + "grad_norm": 6.664103984832764, + "kl": 0.1484375, + "learning_rate": 6.555333333333333e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5167 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.9375, + "epoch": 0.6890666666666667, + "grad_norm": 0.32279595732688904, + "kl": 0.19091796875, + "learning_rate": 6.554666666666666e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5168 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0625, + "epoch": 0.6892, + "grad_norm": 3.6740710735321045, + "kl": 0.19921875, + "learning_rate": 6.554e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5169 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.75, + "epoch": 0.6893333333333334, + "grad_norm": 6.173297882080078, + "kl": 0.51171875, + "learning_rate": 6.553333333333333e-07, + "loss": 0.0205, + "reward": 1.5625, + "reward_std": 0.4955156147480011, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8125, + "step": 5170 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 0.6894666666666667, + "grad_norm": 7.96581506729126, + "kl": 0.201171875, + "learning_rate": 6.552666666666666e-07, + "loss": 0.008, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5171 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.1875, + "epoch": 0.6896, + "grad_norm": 5.893706798553467, + "kl": 0.25048828125, + "learning_rate": 6.552e-07, + "loss": 0.01, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5172 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.6897333333333333, + "grad_norm": 8.026281356811523, + "kl": 0.1943359375, + "learning_rate": 6.551333333333333e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5173 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 0.6898666666666666, + "grad_norm": 3.8028883934020996, + "kl": 0.2255859375, + "learning_rate": 6.550666666666667e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5174 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0, + "epoch": 0.69, + "grad_norm": 7.6956892013549805, + "kl": 0.25390625, + "learning_rate": 6.55e-07, + "loss": 0.0101, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5175 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.6901333333333334, + "grad_norm": 8.048882484436035, + "kl": 0.20703125, + "learning_rate": 6.549333333333334e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5176 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 0.6902666666666667, + "grad_norm": 6.629820346832275, + "kl": 0.240234375, + "learning_rate": 6.548666666666667e-07, + "loss": 0.0096, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5177 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.9375, + "epoch": 0.6904, + "grad_norm": 10.187685012817383, + "kl": 0.341796875, + "learning_rate": 6.548000000000001e-07, + "loss": 0.0137, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 5178 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6875, + "epoch": 0.6905333333333333, + "grad_norm": 7.307285785675049, + "kl": 0.15673828125, + "learning_rate": 6.547333333333333e-07, + "loss": 0.0063, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5179 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.125, + "epoch": 0.6906666666666667, + "grad_norm": 0.6003014445304871, + "kl": 0.28564453125, + "learning_rate": 6.546666666666665e-07, + "loss": 0.0114, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5180 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.25, + "epoch": 0.6908, + "grad_norm": 10.151338577270508, + "kl": 0.2451171875, + "learning_rate": 6.545999999999999e-07, + "loss": 0.0098, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5181 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.6909333333333333, + "grad_norm": 0.36656197905540466, + "kl": 0.2216796875, + "learning_rate": 6.545333333333332e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5182 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.375, + "epoch": 0.6910666666666667, + "grad_norm": 6.8423566818237305, + "kl": 0.22265625, + "learning_rate": 6.544666666666666e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5183 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0, + "epoch": 0.6912, + "grad_norm": 7.126684665679932, + "kl": 0.2138671875, + "learning_rate": 6.543999999999999e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5184 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.6913333333333334, + "grad_norm": 23.5327091217041, + "kl": 1.1962890625, + "learning_rate": 6.543333333333333e-07, + "loss": 0.048, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5185 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.6914666666666667, + "grad_norm": 7.710832118988037, + "kl": 0.25439453125, + "learning_rate": 6.542666666666666e-07, + "loss": 0.0102, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5186 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8125, + "epoch": 0.6916, + "grad_norm": 14.789640426635742, + "kl": 0.4384765625, + "learning_rate": 6.542e-07, + "loss": 0.0175, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5187 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5, + "epoch": 0.6917333333333333, + "grad_norm": 10.534707069396973, + "kl": 0.208984375, + "learning_rate": 6.541333333333333e-07, + "loss": 0.0084, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5188 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 0.6918666666666666, + "grad_norm": 5.2748541831970215, + "kl": 0.2109375, + "learning_rate": 6.540666666666666e-07, + "loss": 0.0084, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5189 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5625, + "epoch": 0.692, + "grad_norm": 9.531292915344238, + "kl": 0.32421875, + "learning_rate": 6.54e-07, + "loss": 0.013, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 5190 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 0.6921333333333334, + "grad_norm": 7.847801208496094, + "kl": 0.208984375, + "learning_rate": 6.539333333333333e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5191 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.625, + "epoch": 0.6922666666666667, + "grad_norm": 4.997392177581787, + "kl": 0.14990234375, + "learning_rate": 6.538666666666667e-07, + "loss": 0.006, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5192 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0625, + "epoch": 0.6924, + "grad_norm": 28.443418502807617, + "kl": 0.1943359375, + "learning_rate": 6.538e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5193 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.4375, + "epoch": 0.6925333333333333, + "grad_norm": 7.840848445892334, + "kl": 0.1845703125, + "learning_rate": 6.537333333333334e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5194 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.6926666666666667, + "grad_norm": 5.596564769744873, + "kl": 0.26611328125, + "learning_rate": 6.536666666666666e-07, + "loss": 0.0106, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5195 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0625, + "epoch": 0.6928, + "grad_norm": 0.47864022850990295, + "kl": 0.2900390625, + "learning_rate": 6.536e-07, + "loss": 0.0116, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5196 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.6875, + "epoch": 0.6929333333333333, + "grad_norm": 7.434576988220215, + "kl": 0.16259765625, + "learning_rate": 6.535333333333333e-07, + "loss": 0.0065, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5197 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 0.6930666666666667, + "grad_norm": 4.1063642501831055, + "kl": 0.296875, + "learning_rate": 6.534666666666666e-07, + "loss": 0.0119, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5198 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0, + "epoch": 0.6932, + "grad_norm": 5.942289352416992, + "kl": 0.232421875, + "learning_rate": 6.534e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5199 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.75, + "epoch": 0.6933333333333334, + "grad_norm": 8.673123359680176, + "kl": 0.427734375, + "learning_rate": 6.533333333333333e-07, + "loss": 0.0171, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5200 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.5625, + "epoch": 0.6934666666666667, + "grad_norm": 5.266241550445557, + "kl": 0.19482421875, + "learning_rate": 6.532666666666667e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5201 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.8125, + "epoch": 0.6936, + "grad_norm": 0.4820953905582428, + "kl": 0.3427734375, + "learning_rate": 6.531999999999999e-07, + "loss": 0.0137, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5202 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6875, + "epoch": 0.6937333333333333, + "grad_norm": 8.235859870910645, + "kl": 0.22509765625, + "learning_rate": 6.531333333333333e-07, + "loss": 0.009, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5203 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.3125, + "epoch": 0.6938666666666666, + "grad_norm": 5.754523277282715, + "kl": 0.26220703125, + "learning_rate": 6.530666666666666e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5204 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0, + "epoch": 0.694, + "grad_norm": 36.14334487915039, + "kl": 0.1708984375, + "learning_rate": 6.53e-07, + "loss": 0.0068, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5205 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.6941333333333334, + "grad_norm": 6.911557674407959, + "kl": 0.189453125, + "learning_rate": 6.529333333333333e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5206 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.625, + "epoch": 0.6942666666666667, + "grad_norm": 4.929715633392334, + "kl": 0.23974609375, + "learning_rate": 6.528666666666667e-07, + "loss": 0.0096, + "reward": 1.6875, + "reward_std": 0.6396867483854294, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 5207 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.6875, + "epoch": 0.6944, + "grad_norm": 6.17883825302124, + "kl": 0.17919921875, + "learning_rate": 6.528e-07, + "loss": 0.0072, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5208 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.0625, + "epoch": 0.6945333333333333, + "grad_norm": 7.897099494934082, + "kl": 0.23779296875, + "learning_rate": 6.527333333333333e-07, + "loss": 0.0095, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5209 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.375, + "epoch": 0.6946666666666667, + "grad_norm": 6.92312479019165, + "kl": 0.2705078125, + "learning_rate": 6.526666666666666e-07, + "loss": 0.0108, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5210 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.125, + "epoch": 0.6948, + "grad_norm": 7.032107353210449, + "kl": 0.1962890625, + "learning_rate": 6.525999999999999e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 5211 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.625, + "epoch": 0.6949333333333333, + "grad_norm": 8.58906364440918, + "kl": 0.2138671875, + "learning_rate": 6.525333333333333e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5212 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.1875, + "epoch": 0.6950666666666667, + "grad_norm": 7.374428749084473, + "kl": 0.2060546875, + "learning_rate": 6.524666666666666e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5213 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3125, + "epoch": 0.6952, + "grad_norm": 5.190303802490234, + "kl": 0.22802734375, + "learning_rate": 6.524e-07, + "loss": 0.0091, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5214 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.6953333333333334, + "grad_norm": 8.389266967773438, + "kl": 0.291015625, + "learning_rate": 6.523333333333333e-07, + "loss": 0.0116, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5215 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.5, + "epoch": 0.6954666666666667, + "grad_norm": 8.35036849975586, + "kl": 0.2197265625, + "learning_rate": 6.522666666666667e-07, + "loss": 0.0088, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5216 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.0625, + "epoch": 0.6956, + "grad_norm": 4.471190929412842, + "kl": 0.1953125, + "learning_rate": 6.522e-07, + "loss": 0.0078, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5217 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.6957333333333333, + "grad_norm": 12.313729286193848, + "kl": 0.18994140625, + "learning_rate": 6.521333333333333e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5218 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 0.6958666666666666, + "grad_norm": 8.940138816833496, + "kl": 0.19677734375, + "learning_rate": 6.520666666666667e-07, + "loss": 0.0079, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5219 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.696, + "grad_norm": 5.940279483795166, + "kl": 0.19482421875, + "learning_rate": 6.52e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5220 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.25, + "epoch": 0.6961333333333334, + "grad_norm": 5.609799861907959, + "kl": 0.24462890625, + "learning_rate": 6.519333333333334e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5221 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.25, + "epoch": 0.6962666666666667, + "grad_norm": 14.419145584106445, + "kl": 0.4990234375, + "learning_rate": 6.518666666666667e-07, + "loss": 0.0199, + "reward": 1.375, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8125, + "step": 5222 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.6964, + "grad_norm": 7.982038497924805, + "kl": 0.2060546875, + "learning_rate": 6.518e-07, + "loss": 0.0083, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5223 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.1875, + "epoch": 0.6965333333333333, + "grad_norm": 7.542389392852783, + "kl": 0.294921875, + "learning_rate": 6.517333333333333e-07, + "loss": 0.0118, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 5224 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0625, + "epoch": 0.6966666666666667, + "grad_norm": 4.75694465637207, + "kl": 0.2353515625, + "learning_rate": 6.516666666666666e-07, + "loss": 0.0094, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 5225 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.75, + "epoch": 0.6968, + "grad_norm": 5.359249591827393, + "kl": 0.173828125, + "learning_rate": 6.515999999999999e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5226 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.625, + "epoch": 0.6969333333333333, + "grad_norm": 9.68712329864502, + "kl": 0.17724609375, + "learning_rate": 6.515333333333332e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5227 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.6875, + "epoch": 0.6970666666666666, + "grad_norm": 18.48320960998535, + "kl": 0.22412109375, + "learning_rate": 6.514666666666666e-07, + "loss": 0.009, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5228 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0625, + "epoch": 0.6972, + "grad_norm": 5.371661186218262, + "kl": 0.24169921875, + "learning_rate": 6.513999999999999e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5229 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8125, + "epoch": 0.6973333333333334, + "grad_norm": 10.657188415527344, + "kl": 0.1806640625, + "learning_rate": 6.513333333333333e-07, + "loss": 0.0072, + "reward": 1.3125, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5230 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.8125, + "epoch": 0.6974666666666667, + "grad_norm": 4.794307231903076, + "kl": 0.24365234375, + "learning_rate": 6.512666666666666e-07, + "loss": 0.0097, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5231 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.0, + "epoch": 0.6976, + "grad_norm": 8.704111099243164, + "kl": 0.2099609375, + "learning_rate": 6.512e-07, + "loss": 0.0084, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5232 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 0.6977333333333333, + "grad_norm": 7.031347751617432, + "kl": 0.19921875, + "learning_rate": 6.511333333333333e-07, + "loss": 0.0079, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5233 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5625, + "epoch": 0.6978666666666666, + "grad_norm": 7.703577041625977, + "kl": 0.2578125, + "learning_rate": 6.510666666666667e-07, + "loss": 0.0103, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 5234 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.698, + "grad_norm": 0.624920666217804, + "kl": 0.1669921875, + "learning_rate": 6.51e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5235 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.25, + "epoch": 0.6981333333333334, + "grad_norm": 5.410050868988037, + "kl": 0.17236328125, + "learning_rate": 6.509333333333333e-07, + "loss": 0.0069, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5236 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.5625, + "epoch": 0.6982666666666667, + "grad_norm": 10.179576873779297, + "kl": 0.22119140625, + "learning_rate": 6.508666666666667e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5237 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.75, + "epoch": 0.6984, + "grad_norm": 8.2864351272583, + "kl": 0.2685546875, + "learning_rate": 6.508e-07, + "loss": 0.0107, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5238 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.8125, + "epoch": 0.6985333333333333, + "grad_norm": 3.6091268062591553, + "kl": 0.259765625, + "learning_rate": 6.507333333333334e-07, + "loss": 0.0104, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5239 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.75, + "epoch": 0.6986666666666667, + "grad_norm": 6.967354774475098, + "kl": 0.2392578125, + "learning_rate": 6.506666666666666e-07, + "loss": 0.0096, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5240 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.125, + "epoch": 0.6988, + "grad_norm": 9.423833847045898, + "kl": 0.537109375, + "learning_rate": 6.506e-07, + "loss": 0.0215, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5241 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.625, + "epoch": 0.6989333333333333, + "grad_norm": 6.59601354598999, + "kl": 0.25146484375, + "learning_rate": 6.505333333333333e-07, + "loss": 0.0101, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5242 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.375, + "epoch": 0.6990666666666666, + "grad_norm": 7.510486602783203, + "kl": 0.2236328125, + "learning_rate": 6.504666666666667e-07, + "loss": 0.009, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5243 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0, + "epoch": 0.6992, + "grad_norm": 20.02863311767578, + "kl": 0.3017578125, + "learning_rate": 6.504e-07, + "loss": 0.0121, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5244 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.6993333333333334, + "grad_norm": 7.0422492027282715, + "kl": 0.2568359375, + "learning_rate": 6.503333333333332e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5245 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.6994666666666667, + "grad_norm": 7.995306491851807, + "kl": 0.21923828125, + "learning_rate": 6.502666666666666e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5246 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.6996, + "grad_norm": 6.953773021697998, + "kl": 0.1865234375, + "learning_rate": 6.501999999999999e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5247 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.6875, + "epoch": 0.6997333333333333, + "grad_norm": 5.610381126403809, + "kl": 0.4140625, + "learning_rate": 6.501333333333333e-07, + "loss": 0.0165, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5248 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.6998666666666666, + "grad_norm": 9.166762351989746, + "kl": 0.30078125, + "learning_rate": 6.500666666666666e-07, + "loss": 0.012, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5249 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.1875, + "epoch": 0.7, + "grad_norm": 10.225579261779785, + "kl": 0.3310546875, + "learning_rate": 6.5e-07, + "loss": 0.0132, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5250 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 0.7001333333333334, + "grad_norm": 6.293673992156982, + "kl": 0.22265625, + "learning_rate": 6.499333333333333e-07, + "loss": 0.0089, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5251 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.875, + "epoch": 0.7002666666666667, + "grad_norm": 8.980913162231445, + "kl": 0.2998046875, + "learning_rate": 6.498666666666667e-07, + "loss": 0.012, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5252 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.6875, + "epoch": 0.7004, + "grad_norm": 7.959515571594238, + "kl": 0.240234375, + "learning_rate": 6.498e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5253 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.25, + "epoch": 0.7005333333333333, + "grad_norm": 12.081365585327148, + "kl": 0.41015625, + "learning_rate": 6.497333333333334e-07, + "loss": 0.0164, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5254 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.6875, + "epoch": 0.7006666666666667, + "grad_norm": 0.7088748216629028, + "kl": 0.376953125, + "learning_rate": 6.496666666666666e-07, + "loss": 0.0151, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5255 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.7008, + "grad_norm": 9.797647476196289, + "kl": 0.255859375, + "learning_rate": 6.495999999999999e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5256 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 0.7009333333333333, + "grad_norm": 16.047306060791016, + "kl": 0.2724609375, + "learning_rate": 6.495333333333333e-07, + "loss": 0.0109, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5257 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.375, + "epoch": 0.7010666666666666, + "grad_norm": 6.181591510772705, + "kl": 0.2607421875, + "learning_rate": 6.494666666666666e-07, + "loss": 0.0104, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5258 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.1875, + "epoch": 0.7012, + "grad_norm": 4.550396919250488, + "kl": 0.359375, + "learning_rate": 6.494e-07, + "loss": 0.0144, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5259 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.9375, + "epoch": 0.7013333333333334, + "grad_norm": 0.4998016357421875, + "kl": 0.2724609375, + "learning_rate": 6.493333333333333e-07, + "loss": 0.0109, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5260 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.375, + "epoch": 0.7014666666666667, + "grad_norm": 16.554407119750977, + "kl": 0.318359375, + "learning_rate": 6.492666666666667e-07, + "loss": 0.0127, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5261 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.8125, + "epoch": 0.7016, + "grad_norm": 5.827136039733887, + "kl": 0.416015625, + "learning_rate": 6.492e-07, + "loss": 0.0166, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5262 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.3125, + "epoch": 0.7017333333333333, + "grad_norm": 8.165949821472168, + "kl": 0.28173828125, + "learning_rate": 6.491333333333334e-07, + "loss": 0.0112, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5263 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.8125, + "epoch": 0.7018666666666666, + "grad_norm": 8.825247764587402, + "kl": 0.208984375, + "learning_rate": 6.490666666666667e-07, + "loss": 0.0083, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5264 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.875, + "epoch": 0.702, + "grad_norm": 4.237284183502197, + "kl": 0.197265625, + "learning_rate": 6.49e-07, + "loss": 0.0079, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5265 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.125, + "epoch": 0.7021333333333334, + "grad_norm": 5.063480377197266, + "kl": 0.27197265625, + "learning_rate": 6.489333333333333e-07, + "loss": 0.0109, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5266 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.25, + "epoch": 0.7022666666666667, + "grad_norm": 8.19227409362793, + "kl": 0.3271484375, + "learning_rate": 6.488666666666666e-07, + "loss": 0.0131, + "reward": 1.1875, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 5267 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.5, + "epoch": 0.7024, + "grad_norm": 6.878514289855957, + "kl": 0.275390625, + "learning_rate": 6.488e-07, + "loss": 0.011, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5268 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.625, + "epoch": 0.7025333333333333, + "grad_norm": 9.809721946716309, + "kl": 0.22265625, + "learning_rate": 6.487333333333333e-07, + "loss": 0.0089, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5269 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.7026666666666667, + "grad_norm": 8.691244125366211, + "kl": 0.27099609375, + "learning_rate": 6.486666666666666e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5270 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.75, + "epoch": 0.7028, + "grad_norm": 8.00822925567627, + "kl": 0.3466796875, + "learning_rate": 6.485999999999999e-07, + "loss": 0.0139, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5271 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 0.7029333333333333, + "grad_norm": 5.565573215484619, + "kl": 0.208984375, + "learning_rate": 6.485333333333333e-07, + "loss": 0.0084, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5272 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.125, + "epoch": 0.7030666666666666, + "grad_norm": 6.538054943084717, + "kl": 0.26318359375, + "learning_rate": 6.484666666666666e-07, + "loss": 0.0105, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5273 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.375, + "epoch": 0.7032, + "grad_norm": 4.708321571350098, + "kl": 0.21044921875, + "learning_rate": 6.483999999999999e-07, + "loss": 0.0084, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5274 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.1875, + "epoch": 0.7033333333333334, + "grad_norm": 6.264973163604736, + "kl": 0.31640625, + "learning_rate": 6.483333333333333e-07, + "loss": 0.0127, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5275 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 0.7034666666666667, + "grad_norm": 9.101787567138672, + "kl": 0.16064453125, + "learning_rate": 6.482666666666666e-07, + "loss": 0.0064, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5276 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0, + "epoch": 0.7036, + "grad_norm": 7.84062385559082, + "kl": 0.25244140625, + "learning_rate": 6.482e-07, + "loss": 0.0101, + "reward": 1.3125, + "reward_std": 0.6845227181911469, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5277 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.625, + "epoch": 0.7037333333333333, + "grad_norm": 6.811967372894287, + "kl": 0.16455078125, + "learning_rate": 6.481333333333333e-07, + "loss": 0.0066, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5278 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 0.7038666666666666, + "grad_norm": 8.816990852355957, + "kl": 0.23779296875, + "learning_rate": 6.480666666666667e-07, + "loss": 0.0095, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5279 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5, + "epoch": 0.704, + "grad_norm": 4.402215957641602, + "kl": 0.23779296875, + "learning_rate": 6.48e-07, + "loss": 0.0095, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5280 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.375, + "epoch": 0.7041333333333334, + "grad_norm": 0.471862256526947, + "kl": 0.20166015625, + "learning_rate": 6.479333333333334e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5281 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.1875, + "epoch": 0.7042666666666667, + "grad_norm": 7.284163475036621, + "kl": 0.2548828125, + "learning_rate": 6.478666666666667e-07, + "loss": 0.0102, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5282 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0, + "epoch": 0.7044, + "grad_norm": 8.498963356018066, + "kl": 0.22119140625, + "learning_rate": 6.478e-07, + "loss": 0.0089, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5283 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5625, + "epoch": 0.7045333333333333, + "grad_norm": 4.155941009521484, + "kl": 0.130859375, + "learning_rate": 6.477333333333334e-07, + "loss": 0.0052, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5284 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.7046666666666667, + "grad_norm": 6.668773651123047, + "kl": 0.2001953125, + "learning_rate": 6.476666666666666e-07, + "loss": 0.008, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5285 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.125, + "epoch": 0.7048, + "grad_norm": 9.806303977966309, + "kl": 0.37890625, + "learning_rate": 6.476e-07, + "loss": 0.0152, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5286 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.7049333333333333, + "grad_norm": 8.445038795471191, + "kl": 0.2314453125, + "learning_rate": 6.475333333333332e-07, + "loss": 0.0093, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5287 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.3125, + "epoch": 0.7050666666666666, + "grad_norm": 9.934020042419434, + "kl": 0.4072265625, + "learning_rate": 6.474666666666666e-07, + "loss": 0.0163, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5288 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.0625, + "epoch": 0.7052, + "grad_norm": 4.904730796813965, + "kl": 0.2255859375, + "learning_rate": 6.473999999999999e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5289 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.625, + "epoch": 0.7053333333333334, + "grad_norm": 9.152312278747559, + "kl": 0.255859375, + "learning_rate": 6.473333333333333e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5290 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 0.7054666666666667, + "grad_norm": 7.464629173278809, + "kl": 0.2060546875, + "learning_rate": 6.472666666666666e-07, + "loss": 0.0082, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5291 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.3125, + "epoch": 0.7056, + "grad_norm": 6.436700820922852, + "kl": 0.240234375, + "learning_rate": 6.471999999999999e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5292 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.25, + "epoch": 0.7057333333333333, + "grad_norm": 5.214740753173828, + "kl": 0.1923828125, + "learning_rate": 6.471333333333333e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5293 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.3125, + "epoch": 0.7058666666666666, + "grad_norm": 8.029158592224121, + "kl": 0.1630859375, + "learning_rate": 6.470666666666666e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5294 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.625, + "epoch": 0.706, + "grad_norm": 6.148681163787842, + "kl": 0.3125, + "learning_rate": 6.47e-07, + "loss": 0.0125, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5295 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.7061333333333333, + "grad_norm": 7.9123358726501465, + "kl": 0.15478515625, + "learning_rate": 6.469333333333333e-07, + "loss": 0.0062, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5296 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.8125, + "epoch": 0.7062666666666667, + "grad_norm": 6.168305397033691, + "kl": 0.21435546875, + "learning_rate": 6.468666666666667e-07, + "loss": 0.0086, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5297 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.625, + "epoch": 0.7064, + "grad_norm": 9.192083358764648, + "kl": 0.2373046875, + "learning_rate": 6.468e-07, + "loss": 0.0095, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5298 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.375, + "epoch": 0.7065333333333333, + "grad_norm": 6.785309791564941, + "kl": 0.169921875, + "learning_rate": 6.467333333333334e-07, + "loss": 0.0068, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5299 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.9375, + "epoch": 0.7066666666666667, + "grad_norm": 8.139331817626953, + "kl": 0.478515625, + "learning_rate": 6.466666666666666e-07, + "loss": 0.0191, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5300 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0625, + "epoch": 0.7068, + "grad_norm": 9.961462020874023, + "kl": 0.2958984375, + "learning_rate": 6.465999999999999e-07, + "loss": 0.0118, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5301 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.4375, + "epoch": 0.7069333333333333, + "grad_norm": 11.605016708374023, + "kl": 0.853515625, + "learning_rate": 6.465333333333333e-07, + "loss": 0.0341, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 5302 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.75, + "epoch": 0.7070666666666666, + "grad_norm": 6.590375900268555, + "kl": 0.23828125, + "learning_rate": 6.464666666666666e-07, + "loss": 0.0095, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5303 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.125, + "epoch": 0.7072, + "grad_norm": 8.035374641418457, + "kl": 0.1767578125, + "learning_rate": 6.464e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5304 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5, + "epoch": 0.7073333333333334, + "grad_norm": 10.316061019897461, + "kl": 0.271484375, + "learning_rate": 6.463333333333333e-07, + "loss": 0.0109, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 5305 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0625, + "epoch": 0.7074666666666667, + "grad_norm": 6.674719333648682, + "kl": 0.153564453125, + "learning_rate": 6.462666666666667e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5306 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.7076, + "grad_norm": 4.634714126586914, + "kl": 0.22216796875, + "learning_rate": 6.462e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5307 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 0.7077333333333333, + "grad_norm": 12.548050880432129, + "kl": 0.548828125, + "learning_rate": 6.461333333333333e-07, + "loss": 0.022, + "reward": 1.3125, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 5308 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5625, + "epoch": 0.7078666666666666, + "grad_norm": 5.331404685974121, + "kl": 0.3369140625, + "learning_rate": 6.460666666666666e-07, + "loss": 0.0135, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5309 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 0.708, + "grad_norm": 9.215402603149414, + "kl": 0.248046875, + "learning_rate": 6.46e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5310 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.9375, + "epoch": 0.7081333333333333, + "grad_norm": 5.366791725158691, + "kl": 0.275390625, + "learning_rate": 6.459333333333333e-07, + "loss": 0.011, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5311 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.7082666666666667, + "grad_norm": 10.485660552978516, + "kl": 0.35302734375, + "learning_rate": 6.458666666666666e-07, + "loss": 0.0141, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5312 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.0, + "epoch": 0.7084, + "grad_norm": 17.223661422729492, + "kl": 0.2861328125, + "learning_rate": 6.458e-07, + "loss": 0.0115, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5313 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.625, + "epoch": 0.7085333333333333, + "grad_norm": 8.558711051940918, + "kl": 0.25927734375, + "learning_rate": 6.457333333333333e-07, + "loss": 0.0104, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5314 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.7086666666666667, + "grad_norm": 5.4294891357421875, + "kl": 0.279296875, + "learning_rate": 6.456666666666667e-07, + "loss": 0.0111, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5315 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.125, + "epoch": 0.7088, + "grad_norm": 7.291904926300049, + "kl": 0.15869140625, + "learning_rate": 6.455999999999999e-07, + "loss": 0.0063, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5316 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0, + "epoch": 0.7089333333333333, + "grad_norm": 5.8384294509887695, + "kl": 0.16455078125, + "learning_rate": 6.455333333333333e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.7019771933555603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 5317 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.25, + "epoch": 0.7090666666666666, + "grad_norm": 8.050628662109375, + "kl": 0.19970703125, + "learning_rate": 6.454666666666666e-07, + "loss": 0.008, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5318 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.1875, + "epoch": 0.7092, + "grad_norm": 4.230269908905029, + "kl": 0.22509765625, + "learning_rate": 6.454e-07, + "loss": 0.009, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5319 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.25, + "epoch": 0.7093333333333334, + "grad_norm": 0.30887138843536377, + "kl": 0.16650390625, + "learning_rate": 6.453333333333333e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5320 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 0.7094666666666667, + "grad_norm": 0.4641127288341522, + "kl": 0.240234375, + "learning_rate": 6.452666666666666e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5321 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 0.7096, + "grad_norm": 4.629668712615967, + "kl": 0.20703125, + "learning_rate": 6.452e-07, + "loss": 0.0083, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5322 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1875, + "epoch": 0.7097333333333333, + "grad_norm": 8.037790298461914, + "kl": 0.19970703125, + "learning_rate": 6.451333333333333e-07, + "loss": 0.008, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5323 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.125, + "epoch": 0.7098666666666666, + "grad_norm": 10.066925048828125, + "kl": 0.3818359375, + "learning_rate": 6.450666666666667e-07, + "loss": 0.0153, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 5324 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.1875, + "epoch": 0.71, + "grad_norm": 9.363527297973633, + "kl": 0.2822265625, + "learning_rate": 6.45e-07, + "loss": 0.0113, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5325 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.125, + "epoch": 0.7101333333333333, + "grad_norm": 9.29017162322998, + "kl": 0.162109375, + "learning_rate": 6.449333333333334e-07, + "loss": 0.0065, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5326 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.25, + "epoch": 0.7102666666666667, + "grad_norm": 10.992045402526855, + "kl": 0.36962890625, + "learning_rate": 6.448666666666667e-07, + "loss": 0.0148, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5327 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.1875, + "epoch": 0.7104, + "grad_norm": 10.856014251708984, + "kl": 0.41162109375, + "learning_rate": 6.448000000000001e-07, + "loss": 0.0165, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5328 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0, + "epoch": 0.7105333333333334, + "grad_norm": 4.750816822052002, + "kl": 0.19677734375, + "learning_rate": 6.447333333333333e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5329 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.0625, + "epoch": 0.7106666666666667, + "grad_norm": 7.248255729675293, + "kl": 0.181640625, + "learning_rate": 6.446666666666666e-07, + "loss": 0.0073, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5330 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.3125, + "epoch": 0.7108, + "grad_norm": 9.408876419067383, + "kl": 0.3720703125, + "learning_rate": 6.445999999999999e-07, + "loss": 0.0149, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5331 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.125, + "epoch": 0.7109333333333333, + "grad_norm": 39.42374801635742, + "kl": 0.7998046875, + "learning_rate": 6.445333333333332e-07, + "loss": 0.0319, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5332 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.7110666666666666, + "grad_norm": 0.7763947248458862, + "kl": 0.298828125, + "learning_rate": 6.444666666666666e-07, + "loss": 0.0119, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5333 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 0.7112, + "grad_norm": 4.832509994506836, + "kl": 0.1298828125, + "learning_rate": 6.443999999999999e-07, + "loss": 0.0052, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5334 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.9375, + "epoch": 0.7113333333333334, + "grad_norm": 4.60767936706543, + "kl": 0.228515625, + "learning_rate": 6.443333333333333e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5335 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.125, + "epoch": 0.7114666666666667, + "grad_norm": 0.4571681320667267, + "kl": 0.203125, + "learning_rate": 6.442666666666666e-07, + "loss": 0.0081, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5336 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 0.7116, + "grad_norm": 6.805804252624512, + "kl": 0.29248046875, + "learning_rate": 6.442e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5337 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.625, + "epoch": 0.7117333333333333, + "grad_norm": 0.384215772151947, + "kl": 0.18017578125, + "learning_rate": 6.441333333333333e-07, + "loss": 0.0072, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5338 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.75, + "epoch": 0.7118666666666666, + "grad_norm": 9.872032165527344, + "kl": 0.171875, + "learning_rate": 6.440666666666666e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5339 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.0625, + "epoch": 0.712, + "grad_norm": 5.966821670532227, + "kl": 0.169921875, + "learning_rate": 6.44e-07, + "loss": 0.0068, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5340 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.7121333333333333, + "grad_norm": 10.286911010742188, + "kl": 0.28515625, + "learning_rate": 6.439333333333333e-07, + "loss": 0.0114, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5341 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.7122666666666667, + "grad_norm": 7.1788649559021, + "kl": 0.20361328125, + "learning_rate": 6.438666666666667e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5342 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.7124, + "grad_norm": 4.101319313049316, + "kl": 0.190673828125, + "learning_rate": 6.438e-07, + "loss": 0.0076, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5343 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5, + "epoch": 0.7125333333333334, + "grad_norm": 4.953159809112549, + "kl": 0.251953125, + "learning_rate": 6.437333333333334e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5344 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 0.7126666666666667, + "grad_norm": 5.638945579528809, + "kl": 0.23779296875, + "learning_rate": 6.436666666666667e-07, + "loss": 0.0095, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5345 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.3125, + "epoch": 0.7128, + "grad_norm": 8.155220031738281, + "kl": 0.169921875, + "learning_rate": 6.436e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5346 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.8125, + "epoch": 0.7129333333333333, + "grad_norm": 6.236475944519043, + "kl": 0.294921875, + "learning_rate": 6.435333333333333e-07, + "loss": 0.0118, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5347 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.8125, + "epoch": 0.7130666666666666, + "grad_norm": 3.9478790760040283, + "kl": 0.1826171875, + "learning_rate": 6.434666666666666e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5348 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.375, + "epoch": 0.7132, + "grad_norm": 3.3915603160858154, + "kl": 0.202392578125, + "learning_rate": 6.434e-07, + "loss": 0.0081, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5349 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.625, + "epoch": 0.7133333333333334, + "grad_norm": 7.539703845977783, + "kl": 0.22265625, + "learning_rate": 6.433333333333332e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5350 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.7134666666666667, + "grad_norm": 7.3763861656188965, + "kl": 0.1669921875, + "learning_rate": 6.432666666666666e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 5351 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.9375, + "epoch": 0.7136, + "grad_norm": 6.297537326812744, + "kl": 0.26953125, + "learning_rate": 6.431999999999999e-07, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5352 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5, + "epoch": 0.7137333333333333, + "grad_norm": 8.404603958129883, + "kl": 0.18701171875, + "learning_rate": 6.431333333333333e-07, + "loss": 0.0075, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5353 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0625, + "epoch": 0.7138666666666666, + "grad_norm": 6.797259330749512, + "kl": 0.193359375, + "learning_rate": 6.430666666666666e-07, + "loss": 0.0077, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5354 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 0.714, + "grad_norm": 5.69990348815918, + "kl": 0.16162109375, + "learning_rate": 6.43e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5355 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1875, + "epoch": 0.7141333333333333, + "grad_norm": 5.92965841293335, + "kl": 0.19287109375, + "learning_rate": 6.429333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5356 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.625, + "epoch": 0.7142666666666667, + "grad_norm": 3.4238364696502686, + "kl": 0.1494140625, + "learning_rate": 6.428666666666667e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5357 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.0625, + "epoch": 0.7144, + "grad_norm": 10.791539192199707, + "kl": 0.33935546875, + "learning_rate": 6.428e-07, + "loss": 0.0136, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5358 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.8125, + "epoch": 0.7145333333333334, + "grad_norm": 4.9467291831970215, + "kl": 0.21484375, + "learning_rate": 6.427333333333333e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5359 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.8125, + "epoch": 0.7146666666666667, + "grad_norm": 6.326785564422607, + "kl": 0.26513671875, + "learning_rate": 6.426666666666667e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5360 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.75, + "epoch": 0.7148, + "grad_norm": 4.662775993347168, + "kl": 0.17626953125, + "learning_rate": 6.425999999999999e-07, + "loss": 0.007, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5361 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.375, + "epoch": 0.7149333333333333, + "grad_norm": 6.1780877113342285, + "kl": 0.20166015625, + "learning_rate": 6.425333333333333e-07, + "loss": 0.008, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5362 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8125, + "epoch": 0.7150666666666666, + "grad_norm": 6.352311611175537, + "kl": 0.24267578125, + "learning_rate": 6.424666666666666e-07, + "loss": 0.0097, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5363 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.3125, + "epoch": 0.7152, + "grad_norm": 6.746992111206055, + "kl": 0.3232421875, + "learning_rate": 6.424e-07, + "loss": 0.0129, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5364 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.75, + "epoch": 0.7153333333333334, + "grad_norm": 6.269286155700684, + "kl": 0.2294921875, + "learning_rate": 6.423333333333333e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5365 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.7154666666666667, + "grad_norm": 4.310076713562012, + "kl": 0.16357421875, + "learning_rate": 6.422666666666667e-07, + "loss": 0.0066, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5366 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 0.7156, + "grad_norm": 5.304470539093018, + "kl": 0.19384765625, + "learning_rate": 6.422e-07, + "loss": 0.0078, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5367 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.7157333333333333, + "grad_norm": 0.3851994574069977, + "kl": 0.1669921875, + "learning_rate": 6.421333333333333e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5368 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.875, + "epoch": 0.7158666666666667, + "grad_norm": 8.60792064666748, + "kl": 0.201171875, + "learning_rate": 6.420666666666667e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5369 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6875, + "epoch": 0.716, + "grad_norm": 6.291331768035889, + "kl": 0.3212890625, + "learning_rate": 6.42e-07, + "loss": 0.0128, + "reward": 1.1875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5370 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 0.7161333333333333, + "grad_norm": 8.716292381286621, + "kl": 0.22021484375, + "learning_rate": 6.419333333333334e-07, + "loss": 0.0088, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5371 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.7162666666666667, + "grad_norm": 6.687720775604248, + "kl": 0.314453125, + "learning_rate": 6.418666666666666e-07, + "loss": 0.0126, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5372 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.9375, + "epoch": 0.7164, + "grad_norm": 4.3641037940979, + "kl": 0.16015625, + "learning_rate": 6.418e-07, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5373 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.7165333333333334, + "grad_norm": 9.342097282409668, + "kl": 0.22314453125, + "learning_rate": 6.417333333333333e-07, + "loss": 0.0089, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5374 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.0, + "epoch": 0.7166666666666667, + "grad_norm": 25.819969177246094, + "kl": 0.1494140625, + "learning_rate": 6.416666666666667e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 0.7168, + "grad_norm": 7.754194736480713, + "kl": 0.19775390625, + "learning_rate": 6.415999999999999e-07, + "loss": 0.0079, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5376 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.8125, + "epoch": 0.7169333333333333, + "grad_norm": 4.698535442352295, + "kl": 0.16259765625, + "learning_rate": 6.415333333333332e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5377 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1875, + "epoch": 0.7170666666666666, + "grad_norm": 5.843613147735596, + "kl": 0.1728515625, + "learning_rate": 6.414666666666666e-07, + "loss": 0.0069, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5378 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.125, + "epoch": 0.7172, + "grad_norm": 5.014617919921875, + "kl": 0.2607421875, + "learning_rate": 6.413999999999999e-07, + "loss": 0.0104, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5379 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.75, + "epoch": 0.7173333333333334, + "grad_norm": 0.6591922044754028, + "kl": 0.27001953125, + "learning_rate": 6.413333333333333e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5380 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5625, + "epoch": 0.7174666666666667, + "grad_norm": 5.257004737854004, + "kl": 0.18896484375, + "learning_rate": 6.412666666666666e-07, + "loss": 0.0075, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5381 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.1875, + "epoch": 0.7176, + "grad_norm": 5.537173748016357, + "kl": 0.267578125, + "learning_rate": 6.412e-07, + "loss": 0.0107, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5382 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.1875, + "epoch": 0.7177333333333333, + "grad_norm": 0.2399178445339203, + "kl": 0.1552734375, + "learning_rate": 6.411333333333333e-07, + "loss": 0.0062, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5383 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.6875, + "epoch": 0.7178666666666667, + "grad_norm": 5.597937107086182, + "kl": 0.2763671875, + "learning_rate": 6.410666666666667e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5384 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.4375, + "epoch": 0.718, + "grad_norm": 14.507187843322754, + "kl": 0.22705078125, + "learning_rate": 6.41e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5385 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 0.7181333333333333, + "grad_norm": 7.732577800750732, + "kl": 0.18603515625, + "learning_rate": 6.409333333333333e-07, + "loss": 0.0074, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5386 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.5, + "epoch": 0.7182666666666667, + "grad_norm": 8.722092628479004, + "kl": 0.19287109375, + "learning_rate": 6.408666666666667e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5387 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.1875, + "epoch": 0.7184, + "grad_norm": 9.374804496765137, + "kl": 0.1826171875, + "learning_rate": 6.408e-07, + "loss": 0.0073, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5388 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 0.7185333333333334, + "grad_norm": 7.530595779418945, + "kl": 0.203125, + "learning_rate": 6.407333333333334e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5389 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1875, + "epoch": 0.7186666666666667, + "grad_norm": 8.810234069824219, + "kl": 0.248046875, + "learning_rate": 6.406666666666667e-07, + "loss": 0.0099, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5390 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 0.7188, + "grad_norm": 3.657900094985962, + "kl": 0.23779296875, + "learning_rate": 6.406e-07, + "loss": 0.0095, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5391 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 0.7189333333333333, + "grad_norm": 4.791460037231445, + "kl": 0.2099609375, + "learning_rate": 6.405333333333332e-07, + "loss": 0.0084, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5392 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.625, + "epoch": 0.7190666666666666, + "grad_norm": 15.528572082519531, + "kl": 0.2021484375, + "learning_rate": 6.404666666666666e-07, + "loss": 0.0081, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5393 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.75, + "epoch": 0.7192, + "grad_norm": 7.998387813568115, + "kl": 0.265625, + "learning_rate": 6.403999999999999e-07, + "loss": 0.0106, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5394 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.375, + "epoch": 0.7193333333333334, + "grad_norm": 7.1479668617248535, + "kl": 0.2177734375, + "learning_rate": 6.403333333333332e-07, + "loss": 0.0087, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5395 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 0.7194666666666667, + "grad_norm": 4.52498197555542, + "kl": 0.271484375, + "learning_rate": 6.402666666666666e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5396 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5625, + "epoch": 0.7196, + "grad_norm": 1.657306432723999, + "kl": 0.271484375, + "learning_rate": 6.401999999999999e-07, + "loss": 0.0108, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5397 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0625, + "epoch": 0.7197333333333333, + "grad_norm": 5.00120735168457, + "kl": 0.15576171875, + "learning_rate": 6.401333333333333e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5398 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.1875, + "epoch": 0.7198666666666667, + "grad_norm": 5.822322368621826, + "kl": 0.15380859375, + "learning_rate": 6.400666666666666e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5399 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.6875, + "epoch": 0.72, + "grad_norm": 5.1160478591918945, + "kl": 0.181640625, + "learning_rate": 6.4e-07, + "loss": 0.0073, + "reward": 1.25, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 5400 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.3125, + "epoch": 0.7201333333333333, + "grad_norm": 6.652249336242676, + "kl": 0.17822265625, + "learning_rate": 6.399333333333333e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5401 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.125, + "epoch": 0.7202666666666667, + "grad_norm": 4.6821393966674805, + "kl": 0.169921875, + "learning_rate": 6.398666666666667e-07, + "loss": 0.0068, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5402 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.7204, + "grad_norm": 7.13041353225708, + "kl": 0.32763671875, + "learning_rate": 6.398e-07, + "loss": 0.0131, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5403 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.9375, + "epoch": 0.7205333333333334, + "grad_norm": 7.417537212371826, + "kl": 0.17822265625, + "learning_rate": 6.397333333333334e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5404 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5, + "epoch": 0.7206666666666667, + "grad_norm": 4.040246963500977, + "kl": 0.2099609375, + "learning_rate": 6.396666666666667e-07, + "loss": 0.0084, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5405 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.125, + "epoch": 0.7208, + "grad_norm": 8.129838943481445, + "kl": 0.26123046875, + "learning_rate": 6.395999999999999e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.7892733812332153, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8125, + "step": 5406 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.75, + "epoch": 0.7209333333333333, + "grad_norm": 8.592211723327637, + "kl": 0.29345703125, + "learning_rate": 6.395333333333333e-07, + "loss": 0.0117, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 5407 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 0.7210666666666666, + "grad_norm": 7.621591567993164, + "kl": 0.2197265625, + "learning_rate": 6.394666666666666e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5408 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.25, + "epoch": 0.7212, + "grad_norm": 6.75283145904541, + "kl": 0.21142578125, + "learning_rate": 6.394e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5409 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5, + "epoch": 0.7213333333333334, + "grad_norm": 3.582919120788574, + "kl": 0.16650390625, + "learning_rate": 6.393333333333333e-07, + "loss": 0.0066, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5410 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.625, + "epoch": 0.7214666666666667, + "grad_norm": 5.214814186096191, + "kl": 0.14453125, + "learning_rate": 6.392666666666667e-07, + "loss": 0.0058, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5411 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.0625, + "epoch": 0.7216, + "grad_norm": 9.345358848571777, + "kl": 0.23046875, + "learning_rate": 6.392e-07, + "loss": 0.0092, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5412 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3125, + "epoch": 0.7217333333333333, + "grad_norm": 7.432713508605957, + "kl": 0.23095703125, + "learning_rate": 6.391333333333334e-07, + "loss": 0.0092, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5413 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.4375, + "epoch": 0.7218666666666667, + "grad_norm": 7.124879360198975, + "kl": 0.22509765625, + "learning_rate": 6.390666666666666e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5414 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.25, + "epoch": 0.722, + "grad_norm": 8.680919647216797, + "kl": 0.2998046875, + "learning_rate": 6.389999999999999e-07, + "loss": 0.012, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5415 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.625, + "epoch": 0.7221333333333333, + "grad_norm": 6.081610202789307, + "kl": 0.16796875, + "learning_rate": 6.389333333333333e-07, + "loss": 0.0067, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5416 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 0.7222666666666666, + "grad_norm": 33.32294464111328, + "kl": 0.1669921875, + "learning_rate": 6.388666666666666e-07, + "loss": 0.0067, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5417 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.7224, + "grad_norm": 8.232760429382324, + "kl": 0.30908203125, + "learning_rate": 6.388e-07, + "loss": 0.0124, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5418 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 0.7225333333333334, + "grad_norm": 7.732980728149414, + "kl": 0.2861328125, + "learning_rate": 6.387333333333333e-07, + "loss": 0.0114, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5419 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9375, + "epoch": 0.7226666666666667, + "grad_norm": 8.671863555908203, + "kl": 0.1728515625, + "learning_rate": 6.386666666666667e-07, + "loss": 0.0069, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5420 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5625, + "epoch": 0.7228, + "grad_norm": 12.923402786254883, + "kl": 0.30419921875, + "learning_rate": 6.385999999999999e-07, + "loss": 0.0122, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5421 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.7229333333333333, + "grad_norm": 4.288008213043213, + "kl": 0.124267578125, + "learning_rate": 6.385333333333333e-07, + "loss": 0.005, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5422 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.75, + "epoch": 0.7230666666666666, + "grad_norm": 6.672684192657471, + "kl": 0.1943359375, + "learning_rate": 6.384666666666666e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5423 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 0.7232, + "grad_norm": 7.241511344909668, + "kl": 0.177734375, + "learning_rate": 6.383999999999999e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5424 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.0625, + "epoch": 0.7233333333333334, + "grad_norm": 5.71403169631958, + "kl": 0.1796875, + "learning_rate": 6.383333333333333e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5425 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.0, + "epoch": 0.7234666666666667, + "grad_norm": 0.47540396451950073, + "kl": 0.29296875, + "learning_rate": 6.382666666666666e-07, + "loss": 0.0117, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5426 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 0.7236, + "grad_norm": 14.189207077026367, + "kl": 0.20263671875, + "learning_rate": 6.382e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5427 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.9375, + "epoch": 0.7237333333333333, + "grad_norm": 5.873058795928955, + "kl": 0.16259765625, + "learning_rate": 6.381333333333333e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5428 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.25, + "epoch": 0.7238666666666667, + "grad_norm": 0.4411320090293884, + "kl": 0.2080078125, + "learning_rate": 6.380666666666667e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5429 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3125, + "epoch": 0.724, + "grad_norm": 4.074389457702637, + "kl": 0.18603515625, + "learning_rate": 6.38e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5430 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.6875, + "epoch": 0.7241333333333333, + "grad_norm": 10.10517692565918, + "kl": 0.26220703125, + "learning_rate": 6.379333333333334e-07, + "loss": 0.0105, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5431 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.7242666666666666, + "grad_norm": 0.3886842727661133, + "kl": 0.2255859375, + "learning_rate": 6.378666666666667e-07, + "loss": 0.009, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5432 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 0.7244, + "grad_norm": 5.87040901184082, + "kl": 0.1572265625, + "learning_rate": 6.378e-07, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5433 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8125, + "epoch": 0.7245333333333334, + "grad_norm": 4.350051403045654, + "kl": 0.1904296875, + "learning_rate": 6.377333333333334e-07, + "loss": 0.0076, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5434 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0625, + "epoch": 0.7246666666666667, + "grad_norm": 5.660112380981445, + "kl": 0.201904296875, + "learning_rate": 6.376666666666666e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5435 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.75, + "epoch": 0.7248, + "grad_norm": 8.003389358520508, + "kl": 0.17138671875, + "learning_rate": 6.375999999999999e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5436 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5625, + "epoch": 0.7249333333333333, + "grad_norm": 5.490379810333252, + "kl": 0.224609375, + "learning_rate": 6.375333333333332e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5437 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5, + "epoch": 0.7250666666666666, + "grad_norm": 3.3189501762390137, + "kl": 0.16259765625, + "learning_rate": 6.374666666666666e-07, + "loss": 0.0065, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5438 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 0.7252, + "grad_norm": 6.322231292724609, + "kl": 0.18017578125, + "learning_rate": 6.373999999999999e-07, + "loss": 0.0072, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5439 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.1875, + "epoch": 0.7253333333333334, + "grad_norm": 8.223359107971191, + "kl": 0.3330078125, + "learning_rate": 6.373333333333333e-07, + "loss": 0.0133, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5440 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.875, + "epoch": 0.7254666666666667, + "grad_norm": 6.229926109313965, + "kl": 0.2568359375, + "learning_rate": 6.372666666666666e-07, + "loss": 0.0103, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5441 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.7256, + "grad_norm": 9.091059684753418, + "kl": 0.28515625, + "learning_rate": 6.371999999999999e-07, + "loss": 0.0114, + "reward": 1.3125, + "reward_std": 0.6983994543552399, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5442 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0625, + "epoch": 0.7257333333333333, + "grad_norm": 38.314605712890625, + "kl": 0.169921875, + "learning_rate": 6.371333333333333e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5443 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4375, + "epoch": 0.7258666666666667, + "grad_norm": 7.854360580444336, + "kl": 0.244140625, + "learning_rate": 6.370666666666666e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.636739045381546, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5444 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.125, + "epoch": 0.726, + "grad_norm": 8.108772277832031, + "kl": 0.2001953125, + "learning_rate": 6.37e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5445 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.3125, + "epoch": 0.7261333333333333, + "grad_norm": 7.87504768371582, + "kl": 0.3037109375, + "learning_rate": 6.369333333333333e-07, + "loss": 0.0121, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5446 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1875, + "epoch": 0.7262666666666666, + "grad_norm": 3.245859146118164, + "kl": 0.189208984375, + "learning_rate": 6.368666666666667e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5447 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.1875, + "epoch": 0.7264, + "grad_norm": 6.541783809661865, + "kl": 0.3193359375, + "learning_rate": 6.368e-07, + "loss": 0.0128, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5448 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1875, + "epoch": 0.7265333333333334, + "grad_norm": 5.804893970489502, + "kl": 0.251953125, + "learning_rate": 6.367333333333334e-07, + "loss": 0.0101, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5449 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1875, + "epoch": 0.7266666666666667, + "grad_norm": 4.33439302444458, + "kl": 0.2099609375, + "learning_rate": 6.366666666666667e-07, + "loss": 0.0084, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5450 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.3125, + "epoch": 0.7268, + "grad_norm": 6.567450046539307, + "kl": 0.14794921875, + "learning_rate": 6.366000000000001e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5451 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.7269333333333333, + "grad_norm": 10.795382499694824, + "kl": 0.18603515625, + "learning_rate": 6.365333333333333e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5452 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.625, + "epoch": 0.7270666666666666, + "grad_norm": 12.556097984313965, + "kl": 0.22021484375, + "learning_rate": 6.364666666666666e-07, + "loss": 0.0088, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5453 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.25, + "epoch": 0.7272, + "grad_norm": 57.33848190307617, + "kl": 0.28125, + "learning_rate": 6.364e-07, + "loss": 0.0113, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5454 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 0.7273333333333334, + "grad_norm": 9.160968780517578, + "kl": 0.14892578125, + "learning_rate": 6.363333333333332e-07, + "loss": 0.006, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5455 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.9375, + "epoch": 0.7274666666666667, + "grad_norm": 6.836593151092529, + "kl": 0.22314453125, + "learning_rate": 6.362666666666666e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5456 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.7276, + "grad_norm": 5.035367965698242, + "kl": 0.2275390625, + "learning_rate": 6.361999999999999e-07, + "loss": 0.0091, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5457 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 0.7277333333333333, + "grad_norm": 4.705207347869873, + "kl": 0.2373046875, + "learning_rate": 6.361333333333333e-07, + "loss": 0.0095, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5458 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.7278666666666667, + "grad_norm": 5.62582540512085, + "kl": 0.24755859375, + "learning_rate": 6.360666666666666e-07, + "loss": 0.0099, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5459 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.25, + "epoch": 0.728, + "grad_norm": 4.577479839324951, + "kl": 0.157470703125, + "learning_rate": 6.36e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5460 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4375, + "epoch": 0.7281333333333333, + "grad_norm": 0.5227499008178711, + "kl": 0.173828125, + "learning_rate": 6.359333333333333e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5461 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1875, + "epoch": 0.7282666666666666, + "grad_norm": 5.76829195022583, + "kl": 0.1650390625, + "learning_rate": 6.358666666666666e-07, + "loss": 0.0066, + "reward": 1.625, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5462 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0625, + "epoch": 0.7284, + "grad_norm": 7.628405570983887, + "kl": 0.21728515625, + "learning_rate": 6.358e-07, + "loss": 0.0087, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5463 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0625, + "epoch": 0.7285333333333334, + "grad_norm": 4.703261852264404, + "kl": 0.1982421875, + "learning_rate": 6.357333333333333e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5464 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.625, + "epoch": 0.7286666666666667, + "grad_norm": 6.572201728820801, + "kl": 0.1435546875, + "learning_rate": 6.356666666666667e-07, + "loss": 0.0057, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5465 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 0.7288, + "grad_norm": 9.653264999389648, + "kl": 0.23388671875, + "learning_rate": 6.356e-07, + "loss": 0.0093, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5466 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.9375, + "epoch": 0.7289333333333333, + "grad_norm": 5.500353813171387, + "kl": 0.1689453125, + "learning_rate": 6.355333333333333e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5467 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.625, + "epoch": 0.7290666666666666, + "grad_norm": 3.6930696964263916, + "kl": 0.2783203125, + "learning_rate": 6.354666666666666e-07, + "loss": 0.0111, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5468 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.25, + "epoch": 0.7292, + "grad_norm": 4.467801570892334, + "kl": 0.17041015625, + "learning_rate": 6.354e-07, + "loss": 0.0068, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5469 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5625, + "epoch": 0.7293333333333333, + "grad_norm": 4.0995097160339355, + "kl": 0.17333984375, + "learning_rate": 6.353333333333333e-07, + "loss": 0.0069, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5470 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0, + "epoch": 0.7294666666666667, + "grad_norm": 4.538815975189209, + "kl": 0.1513671875, + "learning_rate": 6.352666666666666e-07, + "loss": 0.0061, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5471 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0625, + "epoch": 0.7296, + "grad_norm": 4.048537254333496, + "kl": 0.17431640625, + "learning_rate": 6.352e-07, + "loss": 0.007, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5472 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 0.7297333333333333, + "grad_norm": 4.200996398925781, + "kl": 0.162109375, + "learning_rate": 6.351333333333333e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5473 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.7298666666666667, + "grad_norm": 10.15750789642334, + "kl": 0.25537109375, + "learning_rate": 6.350666666666667e-07, + "loss": 0.0102, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5474 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.1875, + "epoch": 0.73, + "grad_norm": 6.798830032348633, + "kl": 0.17529296875, + "learning_rate": 6.35e-07, + "loss": 0.007, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5475 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.25, + "epoch": 0.7301333333333333, + "grad_norm": 5.668441295623779, + "kl": 0.15771484375, + "learning_rate": 6.349333333333334e-07, + "loss": 0.0063, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5476 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5625, + "epoch": 0.7302666666666666, + "grad_norm": 7.6716227531433105, + "kl": 0.18603515625, + "learning_rate": 6.348666666666666e-07, + "loss": 0.0074, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5477 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.7304, + "grad_norm": 83.88028717041016, + "kl": 0.24072265625, + "learning_rate": 6.348e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5478 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5625, + "epoch": 0.7305333333333334, + "grad_norm": 5.074739933013916, + "kl": 0.24755859375, + "learning_rate": 6.347333333333333e-07, + "loss": 0.0099, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5479 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.1875, + "epoch": 0.7306666666666667, + "grad_norm": 5.8411664962768555, + "kl": 0.28271484375, + "learning_rate": 6.346666666666666e-07, + "loss": 0.0113, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5480 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.5, + "epoch": 0.7308, + "grad_norm": 3.3485724925994873, + "kl": 0.15234375, + "learning_rate": 6.346e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5481 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5, + "epoch": 0.7309333333333333, + "grad_norm": 3.256795883178711, + "kl": 0.17333984375, + "learning_rate": 6.345333333333332e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5482 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.875, + "epoch": 0.7310666666666666, + "grad_norm": 5.101113796234131, + "kl": 0.1533203125, + "learning_rate": 6.344666666666666e-07, + "loss": 0.0061, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5483 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.625, + "epoch": 0.7312, + "grad_norm": 7.704946517944336, + "kl": 0.33203125, + "learning_rate": 6.343999999999999e-07, + "loss": 0.0133, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5484 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.1875, + "epoch": 0.7313333333333333, + "grad_norm": 25.88686752319336, + "kl": 0.14990234375, + "learning_rate": 6.343333333333333e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5485 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.125, + "epoch": 0.7314666666666667, + "grad_norm": 5.826079368591309, + "kl": 0.14697265625, + "learning_rate": 6.342666666666666e-07, + "loss": 0.0059, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5486 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.125, + "epoch": 0.7316, + "grad_norm": 4.490764141082764, + "kl": 0.23681640625, + "learning_rate": 6.342e-07, + "loss": 0.0095, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5487 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5, + "epoch": 0.7317333333333333, + "grad_norm": 5.4401445388793945, + "kl": 0.13671875, + "learning_rate": 6.341333333333333e-07, + "loss": 0.0055, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5488 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1875, + "epoch": 0.7318666666666667, + "grad_norm": 4.347394943237305, + "kl": 0.15283203125, + "learning_rate": 6.340666666666666e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5489 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 0.732, + "grad_norm": 6.572088718414307, + "kl": 0.22998046875, + "learning_rate": 6.34e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5490 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.8125, + "epoch": 0.7321333333333333, + "grad_norm": 3.893571376800537, + "kl": 0.12548828125, + "learning_rate": 6.339333333333333e-07, + "loss": 0.005, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5491 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.9375, + "epoch": 0.7322666666666666, + "grad_norm": 5.559085845947266, + "kl": 0.18798828125, + "learning_rate": 6.338666666666667e-07, + "loss": 0.0075, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5492 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.6875, + "epoch": 0.7324, + "grad_norm": 6.742982864379883, + "kl": 0.201171875, + "learning_rate": 6.338e-07, + "loss": 0.0081, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5493 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 0.7325333333333334, + "grad_norm": 6.972675323486328, + "kl": 0.291015625, + "learning_rate": 6.337333333333334e-07, + "loss": 0.0116, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5494 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.4375, + "epoch": 0.7326666666666667, + "grad_norm": 6.640187740325928, + "kl": 0.1904296875, + "learning_rate": 6.336666666666667e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5495 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.875, + "epoch": 0.7328, + "grad_norm": 10.322755813598633, + "kl": 0.48828125, + "learning_rate": 6.336000000000001e-07, + "loss": 0.0195, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.75, + "step": 5496 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.75, + "epoch": 0.7329333333333333, + "grad_norm": 3.7005834579467773, + "kl": 0.14501953125, + "learning_rate": 6.335333333333332e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5497 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.125, + "epoch": 0.7330666666666666, + "grad_norm": 5.333477973937988, + "kl": 0.1455078125, + "learning_rate": 6.334666666666665e-07, + "loss": 0.0058, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5498 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.3125, + "epoch": 0.7332, + "grad_norm": 4.414806842803955, + "kl": 0.23486328125, + "learning_rate": 6.333999999999999e-07, + "loss": 0.0094, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5499 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.7333333333333333, + "grad_norm": 4.577859878540039, + "kl": 0.20947265625, + "learning_rate": 6.333333333333332e-07, + "loss": 0.0084, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5500 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.6875, + "epoch": 0.7334666666666667, + "grad_norm": 5.241847038269043, + "kl": 0.128173828125, + "learning_rate": 6.332666666666666e-07, + "loss": 0.0051, + "reward": 0.9375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.875, + "step": 5501 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.7336, + "grad_norm": 3.2712888717651367, + "kl": 0.220703125, + "learning_rate": 6.331999999999999e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5502 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.875, + "epoch": 0.7337333333333333, + "grad_norm": 6.417647361755371, + "kl": 0.2060546875, + "learning_rate": 6.331333333333333e-07, + "loss": 0.0083, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 5503 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.1875, + "epoch": 0.7338666666666667, + "grad_norm": 7.964112281799316, + "kl": 0.16845703125, + "learning_rate": 6.330666666666666e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5504 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.25, + "epoch": 0.734, + "grad_norm": 6.090093612670898, + "kl": 0.1494140625, + "learning_rate": 6.33e-07, + "loss": 0.006, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5505 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.1875, + "epoch": 0.7341333333333333, + "grad_norm": 6.114781379699707, + "kl": 0.20556640625, + "learning_rate": 6.329333333333333e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.7499763667583466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 5506 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.9375, + "epoch": 0.7342666666666666, + "grad_norm": 6.913915634155273, + "kl": 0.1904296875, + "learning_rate": 6.328666666666667e-07, + "loss": 0.0076, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5507 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.125, + "epoch": 0.7344, + "grad_norm": 4.429660320281982, + "kl": 0.205078125, + "learning_rate": 6.328e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5508 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.8125, + "epoch": 0.7345333333333334, + "grad_norm": 8.037400245666504, + "kl": 0.2666015625, + "learning_rate": 6.327333333333333e-07, + "loss": 0.0106, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5509 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.7346666666666667, + "grad_norm": 7.294204235076904, + "kl": 0.18603515625, + "learning_rate": 6.326666666666667e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5510 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.625, + "epoch": 0.7348, + "grad_norm": 3.6478593349456787, + "kl": 0.158203125, + "learning_rate": 6.326e-07, + "loss": 0.0063, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5511 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.4375, + "epoch": 0.7349333333333333, + "grad_norm": 6.557559013366699, + "kl": 0.1484375, + "learning_rate": 6.325333333333333e-07, + "loss": 0.0059, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5512 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.875, + "epoch": 0.7350666666666666, + "grad_norm": 11.961638450622559, + "kl": 0.1494140625, + "learning_rate": 6.324666666666666e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5513 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3125, + "epoch": 0.7352, + "grad_norm": 6.592728614807129, + "kl": 0.16748046875, + "learning_rate": 6.324e-07, + "loss": 0.0067, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5514 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.125, + "epoch": 0.7353333333333333, + "grad_norm": 4.460270881652832, + "kl": 0.17578125, + "learning_rate": 6.323333333333333e-07, + "loss": 0.007, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5515 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 0.7354666666666667, + "grad_norm": 5.7776994705200195, + "kl": 0.232421875, + "learning_rate": 6.322666666666667e-07, + "loss": 0.0093, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5516 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 0.7356, + "grad_norm": 7.464561939239502, + "kl": 0.2451171875, + "learning_rate": 6.322e-07, + "loss": 0.0098, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 5517 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.7357333333333334, + "grad_norm": 0.3427343964576721, + "kl": 0.2705078125, + "learning_rate": 6.321333333333332e-07, + "loss": 0.0108, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5518 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.8125, + "epoch": 0.7358666666666667, + "grad_norm": 5.340330123901367, + "kl": 0.15283203125, + "learning_rate": 6.320666666666666e-07, + "loss": 0.0061, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5519 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.25, + "epoch": 0.736, + "grad_norm": 6.327037334442139, + "kl": 0.17529296875, + "learning_rate": 6.319999999999999e-07, + "loss": 0.007, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5520 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.4375, + "epoch": 0.7361333333333333, + "grad_norm": 9.05171012878418, + "kl": 0.2666015625, + "learning_rate": 6.319333333333333e-07, + "loss": 0.0107, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5521 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 0.7362666666666666, + "grad_norm": 4.882406234741211, + "kl": 0.1904296875, + "learning_rate": 6.318666666666666e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5522 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.125, + "epoch": 0.7364, + "grad_norm": 7.321242809295654, + "kl": 0.1845703125, + "learning_rate": 6.318e-07, + "loss": 0.0074, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 5523 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.625, + "epoch": 0.7365333333333334, + "grad_norm": 0.33037322759628296, + "kl": 0.18408203125, + "learning_rate": 6.317333333333333e-07, + "loss": 0.0074, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5524 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.75, + "epoch": 0.7366666666666667, + "grad_norm": 11.009464263916016, + "kl": 0.20751953125, + "learning_rate": 6.316666666666667e-07, + "loss": 0.0083, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5525 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.5, + "epoch": 0.7368, + "grad_norm": 10.320897102355957, + "kl": 0.17919921875, + "learning_rate": 6.316e-07, + "loss": 0.0072, + "reward": 1.25, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5526 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.8125, + "epoch": 0.7369333333333333, + "grad_norm": 7.770620822906494, + "kl": 0.1826171875, + "learning_rate": 6.315333333333332e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5527 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.75, + "epoch": 0.7370666666666666, + "grad_norm": 0.33623725175857544, + "kl": 0.22900390625, + "learning_rate": 6.314666666666666e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5528 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.75, + "epoch": 0.7372, + "grad_norm": 11.904500961303711, + "kl": 0.1455078125, + "learning_rate": 6.313999999999999e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5529 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.0625, + "epoch": 0.7373333333333333, + "grad_norm": 5.349250316619873, + "kl": 0.20654296875, + "learning_rate": 6.313333333333333e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5530 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.7374666666666667, + "grad_norm": 12.197196960449219, + "kl": 0.19384765625, + "learning_rate": 6.312666666666666e-07, + "loss": 0.0077, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5531 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 0.7376, + "grad_norm": 12.415163040161133, + "kl": 0.333984375, + "learning_rate": 6.312e-07, + "loss": 0.0134, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5532 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6875, + "epoch": 0.7377333333333334, + "grad_norm": 8.549888610839844, + "kl": 0.20556640625, + "learning_rate": 6.311333333333333e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5533 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.25, + "epoch": 0.7378666666666667, + "grad_norm": 3.8328933715820312, + "kl": 0.158203125, + "learning_rate": 6.310666666666667e-07, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5534 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.9375, + "epoch": 0.738, + "grad_norm": 6.55081844329834, + "kl": 0.148681640625, + "learning_rate": 6.31e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5535 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.7381333333333333, + "grad_norm": 15.317352294921875, + "kl": 0.177734375, + "learning_rate": 6.309333333333333e-07, + "loss": 0.0071, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5536 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.3125, + "epoch": 0.7382666666666666, + "grad_norm": 7.030661106109619, + "kl": 0.17138671875, + "learning_rate": 6.308666666666667e-07, + "loss": 0.0069, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5537 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 0.7384, + "grad_norm": 5.142753601074219, + "kl": 0.26123046875, + "learning_rate": 6.308e-07, + "loss": 0.0104, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5538 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 0.7385333333333334, + "grad_norm": 7.717313289642334, + "kl": 0.15966796875, + "learning_rate": 6.307333333333334e-07, + "loss": 0.0064, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5539 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 0.7386666666666667, + "grad_norm": 4.237880229949951, + "kl": 0.20654296875, + "learning_rate": 6.306666666666666e-07, + "loss": 0.0083, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5540 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3125, + "epoch": 0.7388, + "grad_norm": 7.165454387664795, + "kl": 0.224609375, + "learning_rate": 6.306e-07, + "loss": 0.009, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5541 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 0.7389333333333333, + "grad_norm": 7.689726829528809, + "kl": 0.203125, + "learning_rate": 6.305333333333332e-07, + "loss": 0.0081, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5542 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 0.7390666666666666, + "grad_norm": 6.490419864654541, + "kl": 0.2578125, + "learning_rate": 6.304666666666666e-07, + "loss": 0.0103, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5543 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8125, + "epoch": 0.7392, + "grad_norm": 4.4886016845703125, + "kl": 0.15673828125, + "learning_rate": 6.303999999999999e-07, + "loss": 0.0063, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5544 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 0.7393333333333333, + "grad_norm": 12.00925064086914, + "kl": 0.333984375, + "learning_rate": 6.303333333333332e-07, + "loss": 0.0133, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5545 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.875, + "epoch": 0.7394666666666667, + "grad_norm": 0.33964964747428894, + "kl": 0.16796875, + "learning_rate": 6.302666666666666e-07, + "loss": 0.0067, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5546 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.625, + "epoch": 0.7396, + "grad_norm": 92.24022674560547, + "kl": 0.1474609375, + "learning_rate": 6.301999999999999e-07, + "loss": 0.0059, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5547 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.875, + "epoch": 0.7397333333333334, + "grad_norm": 5.745964527130127, + "kl": 0.158203125, + "learning_rate": 6.301333333333333e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5548 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.7398666666666667, + "grad_norm": 7.665029048919678, + "kl": 0.154296875, + "learning_rate": 6.300666666666666e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5549 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 0.74, + "grad_norm": 0.46717679500579834, + "kl": 0.21044921875, + "learning_rate": 6.3e-07, + "loss": 0.0084, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5550 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.7401333333333333, + "grad_norm": 5.799502372741699, + "kl": 0.1630859375, + "learning_rate": 6.299333333333333e-07, + "loss": 0.0065, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5551 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.3125, + "epoch": 0.7402666666666666, + "grad_norm": 5.382107257843018, + "kl": 0.136962890625, + "learning_rate": 6.298666666666667e-07, + "loss": 0.0055, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5552 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.9375, + "epoch": 0.7404, + "grad_norm": 9.023892402648926, + "kl": 0.28515625, + "learning_rate": 6.298e-07, + "loss": 0.0114, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5553 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.4375, + "epoch": 0.7405333333333334, + "grad_norm": 2.973264217376709, + "kl": 0.120361328125, + "learning_rate": 6.297333333333334e-07, + "loss": 0.0048, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5554 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.6875, + "epoch": 0.7406666666666667, + "grad_norm": 14.108256340026855, + "kl": 0.2744140625, + "learning_rate": 6.296666666666667e-07, + "loss": 0.011, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5555 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.875, + "epoch": 0.7408, + "grad_norm": 0.3332856595516205, + "kl": 0.19921875, + "learning_rate": 6.296e-07, + "loss": 0.008, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5556 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1875, + "epoch": 0.7409333333333333, + "grad_norm": 5.571115970611572, + "kl": 0.255859375, + "learning_rate": 6.295333333333333e-07, + "loss": 0.0103, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5557 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.625, + "epoch": 0.7410666666666667, + "grad_norm": 6.846307277679443, + "kl": 0.125, + "learning_rate": 6.294666666666666e-07, + "loss": 0.005, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5558 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.125, + "epoch": 0.7412, + "grad_norm": 0.3361679017543793, + "kl": 0.20849609375, + "learning_rate": 6.294e-07, + "loss": 0.0083, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5559 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1875, + "epoch": 0.7413333333333333, + "grad_norm": 7.74051570892334, + "kl": 0.4208984375, + "learning_rate": 6.293333333333333e-07, + "loss": 0.0168, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5560 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.7414666666666667, + "grad_norm": 9.351319313049316, + "kl": 0.2119140625, + "learning_rate": 6.292666666666667e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5561 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5, + "epoch": 0.7416, + "grad_norm": 5.700804233551025, + "kl": 0.15185546875, + "learning_rate": 6.291999999999999e-07, + "loss": 0.0061, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5562 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8125, + "epoch": 0.7417333333333334, + "grad_norm": 5.225759029388428, + "kl": 0.22998046875, + "learning_rate": 6.291333333333333e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5563 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.3125, + "epoch": 0.7418666666666667, + "grad_norm": 5.579074382781982, + "kl": 0.13232421875, + "learning_rate": 6.290666666666666e-07, + "loss": 0.0053, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5564 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5, + "epoch": 0.742, + "grad_norm": 4.452743053436279, + "kl": 0.20751953125, + "learning_rate": 6.289999999999999e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5565 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.9375, + "epoch": 0.7421333333333333, + "grad_norm": 8.462122917175293, + "kl": 0.19580078125, + "learning_rate": 6.289333333333333e-07, + "loss": 0.0078, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5566 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.625, + "epoch": 0.7422666666666666, + "grad_norm": 5.8625054359436035, + "kl": 0.119384765625, + "learning_rate": 6.288666666666666e-07, + "loss": 0.0048, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5567 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0, + "epoch": 0.7424, + "grad_norm": 7.656648635864258, + "kl": 0.17529296875, + "learning_rate": 6.288e-07, + "loss": 0.007, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5568 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 0.7425333333333334, + "grad_norm": 43.48297882080078, + "kl": 0.17822265625, + "learning_rate": 6.287333333333333e-07, + "loss": 0.0071, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5569 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.625, + "epoch": 0.7426666666666667, + "grad_norm": 8.404118537902832, + "kl": 0.22265625, + "learning_rate": 6.286666666666667e-07, + "loss": 0.0089, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5570 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.625, + "epoch": 0.7428, + "grad_norm": 6.274819850921631, + "kl": 0.143310546875, + "learning_rate": 6.286e-07, + "loss": 0.0057, + "reward": 1.6875, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5571 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.25, + "epoch": 0.7429333333333333, + "grad_norm": 10.509387016296387, + "kl": 0.35888671875, + "learning_rate": 6.285333333333334e-07, + "loss": 0.0144, + "reward": 1.5625, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5572 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.125, + "epoch": 0.7430666666666667, + "grad_norm": 4.284574508666992, + "kl": 0.17333984375, + "learning_rate": 6.284666666666666e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5573 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5625, + "epoch": 0.7432, + "grad_norm": 21.18536949157715, + "kl": 0.1416015625, + "learning_rate": 6.283999999999999e-07, + "loss": 0.0056, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5574 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.0625, + "epoch": 0.7433333333333333, + "grad_norm": 9.075666427612305, + "kl": 0.155029296875, + "learning_rate": 6.283333333333333e-07, + "loss": 0.0062, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5575 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9375, + "epoch": 0.7434666666666667, + "grad_norm": 3.853259801864624, + "kl": 0.16455078125, + "learning_rate": 6.282666666666666e-07, + "loss": 0.0066, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5576 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.25, + "epoch": 0.7436, + "grad_norm": 0.304538369178772, + "kl": 0.14794921875, + "learning_rate": 6.282e-07, + "loss": 0.0059, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5577 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 0.7437333333333334, + "grad_norm": 0.33264538645744324, + "kl": 0.188720703125, + "learning_rate": 6.281333333333333e-07, + "loss": 0.0076, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5578 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.7438666666666667, + "grad_norm": 15.747987747192383, + "kl": 0.27734375, + "learning_rate": 6.280666666666667e-07, + "loss": 0.0111, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5579 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.4375, + "epoch": 0.744, + "grad_norm": 7.044922828674316, + "kl": 0.2060546875, + "learning_rate": 6.28e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5580 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5, + "epoch": 0.7441333333333333, + "grad_norm": 4.77008581161499, + "kl": 0.15478515625, + "learning_rate": 6.279333333333334e-07, + "loss": 0.0062, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5581 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.625, + "epoch": 0.7442666666666666, + "grad_norm": 4.835831642150879, + "kl": 0.143798828125, + "learning_rate": 6.278666666666667e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5582 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 0.7444, + "grad_norm": 8.693388938903809, + "kl": 0.24951171875, + "learning_rate": 6.277999999999999e-07, + "loss": 0.01, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5583 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6875, + "epoch": 0.7445333333333334, + "grad_norm": 4.258063316345215, + "kl": 0.15576171875, + "learning_rate": 6.277333333333333e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5584 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 0.7446666666666667, + "grad_norm": 4.292876243591309, + "kl": 0.25634765625, + "learning_rate": 6.276666666666666e-07, + "loss": 0.0102, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5585 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 0.7448, + "grad_norm": 5.6189422607421875, + "kl": 0.1787109375, + "learning_rate": 6.276e-07, + "loss": 0.0072, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5586 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 0.7449333333333333, + "grad_norm": 7.103560924530029, + "kl": 0.21435546875, + "learning_rate": 6.275333333333333e-07, + "loss": 0.0086, + "reward": 1.4375, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5587 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.9375, + "epoch": 0.7450666666666667, + "grad_norm": 4.14871072769165, + "kl": 0.20361328125, + "learning_rate": 6.274666666666666e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5588 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.9375, + "epoch": 0.7452, + "grad_norm": 4.98500919342041, + "kl": 0.146484375, + "learning_rate": 6.273999999999999e-07, + "loss": 0.0059, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5589 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.75, + "epoch": 0.7453333333333333, + "grad_norm": 8.405673027038574, + "kl": 0.17626953125, + "learning_rate": 6.273333333333333e-07, + "loss": 0.0071, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5590 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8125, + "epoch": 0.7454666666666667, + "grad_norm": 7.432627201080322, + "kl": 0.17041015625, + "learning_rate": 6.272666666666666e-07, + "loss": 0.0068, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5591 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5625, + "epoch": 0.7456, + "grad_norm": 0.3528733253479004, + "kl": 0.1591796875, + "learning_rate": 6.271999999999999e-07, + "loss": 0.0064, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5592 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.5, + "epoch": 0.7457333333333334, + "grad_norm": 6.60576057434082, + "kl": 0.28125, + "learning_rate": 6.271333333333333e-07, + "loss": 0.0113, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5593 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.375, + "epoch": 0.7458666666666667, + "grad_norm": 3.821899652481079, + "kl": 0.20068359375, + "learning_rate": 6.270666666666666e-07, + "loss": 0.008, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5594 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3125, + "epoch": 0.746, + "grad_norm": 7.130596160888672, + "kl": 0.156494140625, + "learning_rate": 6.27e-07, + "loss": 0.0063, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5595 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.125, + "epoch": 0.7461333333333333, + "grad_norm": 5.386151313781738, + "kl": 0.17333984375, + "learning_rate": 6.269333333333333e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5596 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 0.7462666666666666, + "grad_norm": 5.260470867156982, + "kl": 0.23046875, + "learning_rate": 6.268666666666667e-07, + "loss": 0.0092, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5597 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.7464, + "grad_norm": 7.355652332305908, + "kl": 0.16796875, + "learning_rate": 6.268e-07, + "loss": 0.0067, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5598 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.625, + "epoch": 0.7465333333333334, + "grad_norm": 6.294168949127197, + "kl": 0.20703125, + "learning_rate": 6.267333333333334e-07, + "loss": 0.0083, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5599 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6875, + "epoch": 0.7466666666666667, + "grad_norm": 4.9211530685424805, + "kl": 0.16552734375, + "learning_rate": 6.266666666666667e-07, + "loss": 0.0066, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5600 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 0.7468, + "grad_norm": 6.441906452178955, + "kl": 0.2646484375, + "learning_rate": 6.266000000000001e-07, + "loss": 0.0106, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5601 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.625, + "epoch": 0.7469333333333333, + "grad_norm": 9.754861831665039, + "kl": 0.158203125, + "learning_rate": 6.265333333333334e-07, + "loss": 0.0063, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5602 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.9375, + "epoch": 0.7470666666666667, + "grad_norm": 24.13490867614746, + "kl": 0.20654296875, + "learning_rate": 6.264666666666665e-07, + "loss": 0.0083, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5603 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.7472, + "grad_norm": 9.43064022064209, + "kl": 0.1748046875, + "learning_rate": 6.263999999999999e-07, + "loss": 0.007, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5604 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0625, + "epoch": 0.7473333333333333, + "grad_norm": 8.24328899383545, + "kl": 0.2744140625, + "learning_rate": 6.263333333333332e-07, + "loss": 0.011, + "reward": 1.125, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 5605 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.125, + "epoch": 0.7474666666666666, + "grad_norm": 3.8685266971588135, + "kl": 0.1474609375, + "learning_rate": 6.262666666666666e-07, + "loss": 0.0059, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5606 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.4375, + "epoch": 0.7476, + "grad_norm": 4.755876541137695, + "kl": 0.14892578125, + "learning_rate": 6.261999999999999e-07, + "loss": 0.006, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5607 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.625, + "epoch": 0.7477333333333334, + "grad_norm": 6.635544776916504, + "kl": 0.17822265625, + "learning_rate": 6.261333333333333e-07, + "loss": 0.0071, + "reward": 1.6875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5608 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.75, + "epoch": 0.7478666666666667, + "grad_norm": 10.738550186157227, + "kl": 0.21435546875, + "learning_rate": 6.260666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5609 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 0.748, + "grad_norm": 3.9080731868743896, + "kl": 0.11865234375, + "learning_rate": 6.26e-07, + "loss": 0.0047, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5610 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.8125, + "epoch": 0.7481333333333333, + "grad_norm": 17.31608772277832, + "kl": 0.4150390625, + "learning_rate": 6.259333333333333e-07, + "loss": 0.0166, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 5611 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.9375, + "epoch": 0.7482666666666666, + "grad_norm": 6.818905353546143, + "kl": 0.22119140625, + "learning_rate": 6.258666666666666e-07, + "loss": 0.0088, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5612 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.8125, + "epoch": 0.7484, + "grad_norm": 4.8022565841674805, + "kl": 0.224609375, + "learning_rate": 6.258e-07, + "loss": 0.009, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5613 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.7485333333333334, + "grad_norm": 5.29205322265625, + "kl": 0.2021484375, + "learning_rate": 6.257333333333333e-07, + "loss": 0.0081, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5614 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.125, + "epoch": 0.7486666666666667, + "grad_norm": 5.475064754486084, + "kl": 0.2294921875, + "learning_rate": 6.256666666666667e-07, + "loss": 0.0092, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5615 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.7488, + "grad_norm": 5.4646100997924805, + "kl": 0.2158203125, + "learning_rate": 6.256e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5616 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.7489333333333333, + "grad_norm": 14.425616264343262, + "kl": 0.21484375, + "learning_rate": 6.255333333333334e-07, + "loss": 0.0086, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5617 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.7490666666666667, + "grad_norm": 7.961090087890625, + "kl": 0.267578125, + "learning_rate": 6.254666666666666e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5618 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 0.7492, + "grad_norm": 0.4059627950191498, + "kl": 0.1982421875, + "learning_rate": 6.254e-07, + "loss": 0.0079, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5619 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.6875, + "epoch": 0.7493333333333333, + "grad_norm": 56.23192596435547, + "kl": 0.228515625, + "learning_rate": 6.253333333333333e-07, + "loss": 0.0092, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5620 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5625, + "epoch": 0.7494666666666666, + "grad_norm": 8.57297134399414, + "kl": 0.244140625, + "learning_rate": 6.252666666666666e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5621 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.375, + "epoch": 0.7496, + "grad_norm": 6.369668483734131, + "kl": 0.18994140625, + "learning_rate": 6.252e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5622 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0, + "epoch": 0.7497333333333334, + "grad_norm": 7.637110710144043, + "kl": 0.24072265625, + "learning_rate": 6.251333333333333e-07, + "loss": 0.0096, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5623 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.125, + "epoch": 0.7498666666666667, + "grad_norm": 0.2904861271381378, + "kl": 0.20361328125, + "learning_rate": 6.250666666666667e-07, + "loss": 0.0082, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5624 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.625, + "epoch": 0.75, + "grad_norm": 4.829680442810059, + "kl": 0.18603515625, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0074, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.3125, + "epoch": 0.7501333333333333, + "grad_norm": 3.499498128890991, + "kl": 0.216796875, + "learning_rate": 6.249333333333333e-07, + "loss": 0.0087, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5626 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.125, + "epoch": 0.7502666666666666, + "grad_norm": 6.897875785827637, + "kl": 0.21533203125, + "learning_rate": 6.248666666666666e-07, + "loss": 0.0086, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 5627 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.9375, + "epoch": 0.7504, + "grad_norm": 4.835118770599365, + "kl": 0.12548828125, + "learning_rate": 6.248e-07, + "loss": 0.005, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5628 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75, + "epoch": 0.7505333333333334, + "grad_norm": 6.202419757843018, + "kl": 0.24365234375, + "learning_rate": 6.247333333333333e-07, + "loss": 0.0097, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5629 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5625, + "epoch": 0.7506666666666667, + "grad_norm": 8.183453559875488, + "kl": 0.306640625, + "learning_rate": 6.246666666666667e-07, + "loss": 0.0123, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5630 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.25, + "epoch": 0.7508, + "grad_norm": 8.026069641113281, + "kl": 0.2783203125, + "learning_rate": 6.246e-07, + "loss": 0.0111, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5631 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 0.7509333333333333, + "grad_norm": 9.729825973510742, + "kl": 0.37890625, + "learning_rate": 6.245333333333333e-07, + "loss": 0.0151, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 5632 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.4375, + "epoch": 0.7510666666666667, + "grad_norm": 8.076676368713379, + "kl": 0.20556640625, + "learning_rate": 6.244666666666666e-07, + "loss": 0.0082, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5633 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.7512, + "grad_norm": 8.785399436950684, + "kl": 0.34716796875, + "learning_rate": 6.243999999999999e-07, + "loss": 0.0138, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5634 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1875, + "epoch": 0.7513333333333333, + "grad_norm": 4.0094475746154785, + "kl": 0.2333984375, + "learning_rate": 6.243333333333333e-07, + "loss": 0.0093, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5635 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.125, + "epoch": 0.7514666666666666, + "grad_norm": 7.609216690063477, + "kl": 0.1591796875, + "learning_rate": 6.242666666666666e-07, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5636 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.125, + "epoch": 0.7516, + "grad_norm": 4.08499002456665, + "kl": 0.18603515625, + "learning_rate": 6.242e-07, + "loss": 0.0074, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5637 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.75, + "epoch": 0.7517333333333334, + "grad_norm": 6.085355758666992, + "kl": 0.21728515625, + "learning_rate": 6.241333333333333e-07, + "loss": 0.0087, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5638 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6875, + "epoch": 0.7518666666666667, + "grad_norm": 18.08646011352539, + "kl": 0.248046875, + "learning_rate": 6.240666666666667e-07, + "loss": 0.0099, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5639 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 0.752, + "grad_norm": 19.61390495300293, + "kl": 0.15771484375, + "learning_rate": 6.24e-07, + "loss": 0.0063, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5640 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0625, + "epoch": 0.7521333333333333, + "grad_norm": 0.6609634757041931, + "kl": 0.166015625, + "learning_rate": 6.239333333333333e-07, + "loss": 0.0066, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5641 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3125, + "epoch": 0.7522666666666666, + "grad_norm": 5.81947660446167, + "kl": 0.14453125, + "learning_rate": 6.238666666666667e-07, + "loss": 0.0058, + "reward": 1.25, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5642 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.75, + "epoch": 0.7524, + "grad_norm": 0.8525881767272949, + "kl": 0.220703125, + "learning_rate": 6.238e-07, + "loss": 0.0088, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5643 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.8125, + "epoch": 0.7525333333333334, + "grad_norm": 5.257746696472168, + "kl": 0.19677734375, + "learning_rate": 6.237333333333334e-07, + "loss": 0.0079, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5644 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0, + "epoch": 0.7526666666666667, + "grad_norm": 5.3414812088012695, + "kl": 0.138671875, + "learning_rate": 6.236666666666667e-07, + "loss": 0.0056, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5645 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.125, + "epoch": 0.7528, + "grad_norm": 3.4290215969085693, + "kl": 0.14697265625, + "learning_rate": 6.236e-07, + "loss": 0.0059, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5646 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.5625, + "epoch": 0.7529333333333333, + "grad_norm": 4.168577194213867, + "kl": 0.15087890625, + "learning_rate": 6.235333333333333e-07, + "loss": 0.006, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5647 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.875, + "epoch": 0.7530666666666667, + "grad_norm": 4.409804821014404, + "kl": 0.1572265625, + "learning_rate": 6.234666666666666e-07, + "loss": 0.0063, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5648 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0, + "epoch": 0.7532, + "grad_norm": 5.840556621551514, + "kl": 0.1728515625, + "learning_rate": 6.233999999999999e-07, + "loss": 0.0069, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5649 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.1875, + "epoch": 0.7533333333333333, + "grad_norm": 4.682425022125244, + "kl": 0.20947265625, + "learning_rate": 6.233333333333332e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5650 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.0625, + "epoch": 0.7534666666666666, + "grad_norm": 4.980949878692627, + "kl": 0.14892578125, + "learning_rate": 6.232666666666666e-07, + "loss": 0.006, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5651 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.25, + "epoch": 0.7536, + "grad_norm": 5.095736503601074, + "kl": 0.1650390625, + "learning_rate": 6.231999999999999e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5652 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0625, + "epoch": 0.7537333333333334, + "grad_norm": 6.515046119689941, + "kl": 0.15576171875, + "learning_rate": 6.231333333333333e-07, + "loss": 0.0062, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5653 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9375, + "epoch": 0.7538666666666667, + "grad_norm": 8.675559997558594, + "kl": 0.169921875, + "learning_rate": 6.230666666666666e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5654 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3125, + "epoch": 0.754, + "grad_norm": 6.16168737411499, + "kl": 0.20849609375, + "learning_rate": 6.23e-07, + "loss": 0.0083, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5655 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.6875, + "epoch": 0.7541333333333333, + "grad_norm": 5.0770697593688965, + "kl": 0.17822265625, + "learning_rate": 6.229333333333333e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5656 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 0.7542666666666666, + "grad_norm": 6.543984413146973, + "kl": 0.1884765625, + "learning_rate": 6.228666666666667e-07, + "loss": 0.0075, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5657 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.8125, + "epoch": 0.7544, + "grad_norm": 0.312248557806015, + "kl": 0.1669921875, + "learning_rate": 6.228e-07, + "loss": 0.0067, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5658 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.625, + "epoch": 0.7545333333333333, + "grad_norm": 5.361613750457764, + "kl": 0.14306640625, + "learning_rate": 6.227333333333333e-07, + "loss": 0.0057, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5659 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.375, + "epoch": 0.7546666666666667, + "grad_norm": 4.223623275756836, + "kl": 0.1328125, + "learning_rate": 6.226666666666667e-07, + "loss": 0.0053, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5660 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 0.7548, + "grad_norm": 4.795492172241211, + "kl": 0.15576171875, + "learning_rate": 6.226e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5661 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 0.7549333333333333, + "grad_norm": 6.5028581619262695, + "kl": 0.19384765625, + "learning_rate": 6.225333333333334e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5662 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.875, + "epoch": 0.7550666666666667, + "grad_norm": 4.733079433441162, + "kl": 0.130126953125, + "learning_rate": 6.224666666666666e-07, + "loss": 0.0052, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5663 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.875, + "epoch": 0.7552, + "grad_norm": 0.312053382396698, + "kl": 0.16943359375, + "learning_rate": 6.224e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5664 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.9375, + "epoch": 0.7553333333333333, + "grad_norm": 4.488484859466553, + "kl": 0.130859375, + "learning_rate": 6.223333333333333e-07, + "loss": 0.0052, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5665 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.625, + "epoch": 0.7554666666666666, + "grad_norm": 5.414924144744873, + "kl": 0.189453125, + "learning_rate": 6.222666666666667e-07, + "loss": 0.0076, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5666 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.625, + "epoch": 0.7556, + "grad_norm": 3.609905958175659, + "kl": 0.14453125, + "learning_rate": 6.221999999999999e-07, + "loss": 0.0058, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5667 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.8125, + "epoch": 0.7557333333333334, + "grad_norm": 10.327569007873535, + "kl": 0.177978515625, + "learning_rate": 6.221333333333332e-07, + "loss": 0.0071, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5668 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.625, + "epoch": 0.7558666666666667, + "grad_norm": 3.8893520832061768, + "kl": 0.14013671875, + "learning_rate": 6.220666666666666e-07, + "loss": 0.0056, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5669 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.125, + "epoch": 0.756, + "grad_norm": 4.446323394775391, + "kl": 0.17919921875, + "learning_rate": 6.219999999999999e-07, + "loss": 0.0072, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5670 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.375, + "epoch": 0.7561333333333333, + "grad_norm": 6.957337379455566, + "kl": 0.1748046875, + "learning_rate": 6.219333333333333e-07, + "loss": 0.007, + "reward": 1.25, + "reward_std": 0.7490041851997375, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8125, + "step": 5671 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.9375, + "epoch": 0.7562666666666666, + "grad_norm": 5.598616600036621, + "kl": 0.134521484375, + "learning_rate": 6.218666666666666e-07, + "loss": 0.0054, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5672 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6875, + "epoch": 0.7564, + "grad_norm": 0.2731248140335083, + "kl": 0.162109375, + "learning_rate": 6.218e-07, + "loss": 0.0065, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5673 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.375, + "epoch": 0.7565333333333333, + "grad_norm": 6.8868584632873535, + "kl": 0.28125, + "learning_rate": 6.217333333333333e-07, + "loss": 0.0113, + "reward": 1.3125, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5674 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 0.7566666666666667, + "grad_norm": 7.578523635864258, + "kl": 0.19873046875, + "learning_rate": 6.216666666666667e-07, + "loss": 0.0079, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5675 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9375, + "epoch": 0.7568, + "grad_norm": 22.120914459228516, + "kl": 0.18017578125, + "learning_rate": 6.216e-07, + "loss": 0.0072, + "reward": 1.4375, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5676 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3125, + "epoch": 0.7569333333333333, + "grad_norm": 6.295637130737305, + "kl": 0.20068359375, + "learning_rate": 6.215333333333334e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5677 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.1875, + "epoch": 0.7570666666666667, + "grad_norm": 3.977482557296753, + "kl": 0.169921875, + "learning_rate": 6.214666666666666e-07, + "loss": 0.0068, + "reward": 1.3125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5678 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.625, + "epoch": 0.7572, + "grad_norm": 8.57911491394043, + "kl": 0.2138671875, + "learning_rate": 6.213999999999999e-07, + "loss": 0.0085, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5679 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.625, + "epoch": 0.7573333333333333, + "grad_norm": 5.845137596130371, + "kl": 0.14501953125, + "learning_rate": 6.213333333333333e-07, + "loss": 0.0058, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5680 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5625, + "epoch": 0.7574666666666666, + "grad_norm": 4.523271560668945, + "kl": 0.16943359375, + "learning_rate": 6.212666666666666e-07, + "loss": 0.0068, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5681 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.75, + "epoch": 0.7576, + "grad_norm": 6.578713417053223, + "kl": 0.1484375, + "learning_rate": 6.212e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5682 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.3125, + "epoch": 0.7577333333333334, + "grad_norm": 21.380882263183594, + "kl": 0.60302734375, + "learning_rate": 6.211333333333333e-07, + "loss": 0.024, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5683 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5, + "epoch": 0.7578666666666667, + "grad_norm": 9.246201515197754, + "kl": 0.1748046875, + "learning_rate": 6.210666666666667e-07, + "loss": 0.007, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5684 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.4375, + "epoch": 0.758, + "grad_norm": 7.757384777069092, + "kl": 0.205078125, + "learning_rate": 6.21e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5685 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5, + "epoch": 0.7581333333333333, + "grad_norm": 6.629530429840088, + "kl": 0.15478515625, + "learning_rate": 6.209333333333334e-07, + "loss": 0.0062, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5686 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8125, + "epoch": 0.7582666666666666, + "grad_norm": 6.5881195068359375, + "kl": 0.142578125, + "learning_rate": 6.208666666666667e-07, + "loss": 0.0057, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5687 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3125, + "epoch": 0.7584, + "grad_norm": 4.991456508636475, + "kl": 0.16455078125, + "learning_rate": 6.208e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5688 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5, + "epoch": 0.7585333333333333, + "grad_norm": 5.912076950073242, + "kl": 0.228515625, + "learning_rate": 6.207333333333333e-07, + "loss": 0.0091, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5689 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1875, + "epoch": 0.7586666666666667, + "grad_norm": 4.58439302444458, + "kl": 0.12548828125, + "learning_rate": 6.206666666666666e-07, + "loss": 0.005, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5690 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8125, + "epoch": 0.7588, + "grad_norm": 7.3480143547058105, + "kl": 0.25048828125, + "learning_rate": 6.206e-07, + "loss": 0.01, + "reward": 1.5, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 5691 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0625, + "epoch": 0.7589333333333333, + "grad_norm": 4.580042839050293, + "kl": 0.140625, + "learning_rate": 6.205333333333333e-07, + "loss": 0.0056, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5692 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 0.7590666666666667, + "grad_norm": 7.7823286056518555, + "kl": 0.17724609375, + "learning_rate": 6.204666666666667e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5693 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.5, + "epoch": 0.7592, + "grad_norm": 5.441606044769287, + "kl": 0.179443359375, + "learning_rate": 6.203999999999999e-07, + "loss": 0.0072, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5694 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5625, + "epoch": 0.7593333333333333, + "grad_norm": 5.16783332824707, + "kl": 0.136962890625, + "learning_rate": 6.203333333333333e-07, + "loss": 0.0055, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5695 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8125, + "epoch": 0.7594666666666666, + "grad_norm": 5.850629806518555, + "kl": 0.197265625, + "learning_rate": 6.202666666666666e-07, + "loss": 0.0079, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5696 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.375, + "epoch": 0.7596, + "grad_norm": 6.5245513916015625, + "kl": 0.20849609375, + "learning_rate": 6.201999999999999e-07, + "loss": 0.0083, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5697 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.75, + "epoch": 0.7597333333333334, + "grad_norm": 14.812931060791016, + "kl": 0.23974609375, + "learning_rate": 6.201333333333333e-07, + "loss": 0.0096, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5698 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 0.7598666666666667, + "grad_norm": 4.731354713439941, + "kl": 0.1728515625, + "learning_rate": 6.200666666666666e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5699 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.76, + "grad_norm": 5.596025466918945, + "kl": 0.22705078125, + "learning_rate": 6.2e-07, + "loss": 0.0091, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5700 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.75, + "epoch": 0.7601333333333333, + "grad_norm": 14.631034851074219, + "kl": 0.5712890625, + "learning_rate": 6.199333333333333e-07, + "loss": 0.0228, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5701 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.375, + "epoch": 0.7602666666666666, + "grad_norm": 9.098596572875977, + "kl": 0.20556640625, + "learning_rate": 6.198666666666667e-07, + "loss": 0.0082, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5702 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8125, + "epoch": 0.7604, + "grad_norm": 7.84731388092041, + "kl": 0.18408203125, + "learning_rate": 6.198e-07, + "loss": 0.0074, + "reward": 1.5, + "reward_std": 0.7168372869491577, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 5703 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.25, + "epoch": 0.7605333333333333, + "grad_norm": 5.475016117095947, + "kl": 0.20068359375, + "learning_rate": 6.197333333333334e-07, + "loss": 0.008, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5704 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0, + "epoch": 0.7606666666666667, + "grad_norm": 7.159987449645996, + "kl": 0.17333984375, + "learning_rate": 6.196666666666667e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5705 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25, + "epoch": 0.7608, + "grad_norm": 5.596301555633545, + "kl": 0.15087890625, + "learning_rate": 6.196e-07, + "loss": 0.006, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5706 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5625, + "epoch": 0.7609333333333334, + "grad_norm": 4.618723392486572, + "kl": 0.1689453125, + "learning_rate": 6.195333333333334e-07, + "loss": 0.0068, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5707 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.75, + "epoch": 0.7610666666666667, + "grad_norm": 3.514296531677246, + "kl": 0.14501953125, + "learning_rate": 6.194666666666667e-07, + "loss": 0.0058, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5708 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 0.7612, + "grad_norm": 3.9170916080474854, + "kl": 0.236328125, + "learning_rate": 6.194e-07, + "loss": 0.0094, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5709 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.3125, + "epoch": 0.7613333333333333, + "grad_norm": 9.166778564453125, + "kl": 0.16064453125, + "learning_rate": 6.193333333333332e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5710 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1875, + "epoch": 0.7614666666666666, + "grad_norm": 18.29347038269043, + "kl": 0.19921875, + "learning_rate": 6.192666666666666e-07, + "loss": 0.008, + "reward": 1.5625, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.875, + "step": 5711 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.7616, + "grad_norm": 32.46543502807617, + "kl": 0.20458984375, + "learning_rate": 6.191999999999999e-07, + "loss": 0.0082, + "reward": 1.3125, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5712 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.625, + "epoch": 0.7617333333333334, + "grad_norm": 4.164909839630127, + "kl": 0.13330078125, + "learning_rate": 6.191333333333333e-07, + "loss": 0.0054, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5713 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 0.7618666666666667, + "grad_norm": 7.252896785736084, + "kl": 0.16357421875, + "learning_rate": 6.190666666666666e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5714 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 0.762, + "grad_norm": 10.121085166931152, + "kl": 0.22314453125, + "learning_rate": 6.189999999999999e-07, + "loss": 0.0089, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5715 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.6875, + "epoch": 0.7621333333333333, + "grad_norm": 120.05931091308594, + "kl": 0.1865234375, + "learning_rate": 6.189333333333333e-07, + "loss": 0.0075, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5716 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5625, + "epoch": 0.7622666666666666, + "grad_norm": 3.5886244773864746, + "kl": 0.1728515625, + "learning_rate": 6.188666666666666e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5717 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.875, + "epoch": 0.7624, + "grad_norm": 4.692261219024658, + "kl": 0.203125, + "learning_rate": 6.188e-07, + "loss": 0.0081, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5718 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.0625, + "epoch": 0.7625333333333333, + "grad_norm": 7.120339393615723, + "kl": 0.20166015625, + "learning_rate": 6.187333333333333e-07, + "loss": 0.0081, + "reward": 1.3125, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 5719 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.0625, + "epoch": 0.7626666666666667, + "grad_norm": 4.770634174346924, + "kl": 0.19091796875, + "learning_rate": 6.186666666666667e-07, + "loss": 0.0076, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5720 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8125, + "epoch": 0.7628, + "grad_norm": 5.549770355224609, + "kl": 0.149169921875, + "learning_rate": 6.186e-07, + "loss": 0.006, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5721 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5, + "epoch": 0.7629333333333334, + "grad_norm": 5.93183708190918, + "kl": 0.132080078125, + "learning_rate": 6.185333333333334e-07, + "loss": 0.0053, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5722 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 0.7630666666666667, + "grad_norm": 4.994113922119141, + "kl": 0.2646484375, + "learning_rate": 6.184666666666667e-07, + "loss": 0.0106, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5723 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.375, + "epoch": 0.7632, + "grad_norm": 6.194400310516357, + "kl": 0.21435546875, + "learning_rate": 6.183999999999999e-07, + "loss": 0.0086, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 5724 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.75, + "epoch": 0.7633333333333333, + "grad_norm": 5.616654872894287, + "kl": 0.18115234375, + "learning_rate": 6.183333333333333e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5725 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.875, + "epoch": 0.7634666666666666, + "grad_norm": 8.007363319396973, + "kl": 0.18896484375, + "learning_rate": 6.182666666666666e-07, + "loss": 0.0076, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5726 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.0625, + "epoch": 0.7636, + "grad_norm": 5.046943187713623, + "kl": 0.154296875, + "learning_rate": 6.182e-07, + "loss": 0.0062, + "reward": 1.3125, + "reward_std": 0.5876962244510651, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5727 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.9375, + "epoch": 0.7637333333333334, + "grad_norm": 6.3326334953308105, + "kl": 0.16748046875, + "learning_rate": 6.181333333333333e-07, + "loss": 0.0067, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5728 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.125, + "epoch": 0.7638666666666667, + "grad_norm": 5.279118537902832, + "kl": 0.1904296875, + "learning_rate": 6.180666666666667e-07, + "loss": 0.0076, + "reward": 1.3125, + "reward_std": 0.6487165093421936, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 5729 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5, + "epoch": 0.764, + "grad_norm": 5.246060371398926, + "kl": 0.1494140625, + "learning_rate": 6.18e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5730 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 0.7641333333333333, + "grad_norm": 6.819084644317627, + "kl": 0.20361328125, + "learning_rate": 6.179333333333333e-07, + "loss": 0.0081, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5731 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 0.7642666666666666, + "grad_norm": 7.185018539428711, + "kl": 0.26123046875, + "learning_rate": 6.178666666666666e-07, + "loss": 0.0104, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5732 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.9375, + "epoch": 0.7644, + "grad_norm": 3.964740753173828, + "kl": 0.127197265625, + "learning_rate": 6.178e-07, + "loss": 0.0051, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5733 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.125, + "epoch": 0.7645333333333333, + "grad_norm": 6.41440486907959, + "kl": 0.224609375, + "learning_rate": 6.177333333333333e-07, + "loss": 0.009, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5734 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.8125, + "epoch": 0.7646666666666667, + "grad_norm": 187.6031036376953, + "kl": 0.2216796875, + "learning_rate": 6.176666666666666e-07, + "loss": 0.0089, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5735 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.9375, + "epoch": 0.7648, + "grad_norm": 0.3129000663757324, + "kl": 0.197265625, + "learning_rate": 6.176e-07, + "loss": 0.0079, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5736 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.375, + "epoch": 0.7649333333333334, + "grad_norm": 9.193326950073242, + "kl": 0.154296875, + "learning_rate": 6.175333333333333e-07, + "loss": 0.0062, + "reward": 1.0, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5737 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.25, + "epoch": 0.7650666666666667, + "grad_norm": 4.921482086181641, + "kl": 0.1337890625, + "learning_rate": 6.174666666666667e-07, + "loss": 0.0054, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5738 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.7652, + "grad_norm": 8.27313232421875, + "kl": 0.2392578125, + "learning_rate": 6.173999999999999e-07, + "loss": 0.0096, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5739 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6875, + "epoch": 0.7653333333333333, + "grad_norm": 5.911664009094238, + "kl": 0.119140625, + "learning_rate": 6.173333333333333e-07, + "loss": 0.0048, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5740 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.625, + "epoch": 0.7654666666666666, + "grad_norm": 5.64099645614624, + "kl": 0.15673828125, + "learning_rate": 6.172666666666666e-07, + "loss": 0.0063, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5741 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 0.7656, + "grad_norm": 5.291781902313232, + "kl": 0.13330078125, + "learning_rate": 6.172e-07, + "loss": 0.0053, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5742 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5, + "epoch": 0.7657333333333334, + "grad_norm": 5.283191204071045, + "kl": 0.24462890625, + "learning_rate": 6.171333333333333e-07, + "loss": 0.0098, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5743 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 0.7658666666666667, + "grad_norm": 130.27243041992188, + "kl": 0.20068359375, + "learning_rate": 6.170666666666666e-07, + "loss": 0.008, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 5744 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.766, + "grad_norm": 6.7753987312316895, + "kl": 0.1494140625, + "learning_rate": 6.17e-07, + "loss": 0.006, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5745 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.5625, + "epoch": 0.7661333333333333, + "grad_norm": 5.890665054321289, + "kl": 0.142822265625, + "learning_rate": 6.169333333333333e-07, + "loss": 0.0057, + "reward": 1.125, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5746 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.5, + "epoch": 0.7662666666666667, + "grad_norm": 5.564626693725586, + "kl": 0.14794921875, + "learning_rate": 6.168666666666667e-07, + "loss": 0.0059, + "reward": 1.375, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5747 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.8125, + "epoch": 0.7664, + "grad_norm": 6.273329734802246, + "kl": 0.14453125, + "learning_rate": 6.168e-07, + "loss": 0.0058, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5748 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4375, + "epoch": 0.7665333333333333, + "grad_norm": 15.013668060302734, + "kl": 0.20361328125, + "learning_rate": 6.167333333333334e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5749 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5625, + "epoch": 0.7666666666666667, + "grad_norm": 9.914219856262207, + "kl": 0.1611328125, + "learning_rate": 6.166666666666667e-07, + "loss": 0.0065, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5750 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9375, + "epoch": 0.7668, + "grad_norm": 0.4633583426475525, + "kl": 0.15478515625, + "learning_rate": 6.166000000000001e-07, + "loss": 0.0062, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5751 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.875, + "epoch": 0.7669333333333334, + "grad_norm": 4.340129852294922, + "kl": 0.126220703125, + "learning_rate": 6.165333333333333e-07, + "loss": 0.0051, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5752 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.125, + "epoch": 0.7670666666666667, + "grad_norm": 6.885234832763672, + "kl": 0.2265625, + "learning_rate": 6.164666666666666e-07, + "loss": 0.0091, + "reward": 1.125, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 5753 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0, + "epoch": 0.7672, + "grad_norm": 13.135144233703613, + "kl": 0.39306640625, + "learning_rate": 6.163999999999999e-07, + "loss": 0.0158, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5754 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 0.7673333333333333, + "grad_norm": 9.251069068908691, + "kl": 0.167236328125, + "learning_rate": 6.163333333333332e-07, + "loss": 0.0067, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5755 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.1875, + "epoch": 0.7674666666666666, + "grad_norm": 6.094717979431152, + "kl": 0.1806640625, + "learning_rate": 6.162666666666666e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5756 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.875, + "epoch": 0.7676, + "grad_norm": 7.5710296630859375, + "kl": 0.18505859375, + "learning_rate": 6.161999999999999e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9375, + "step": 5757 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3125, + "epoch": 0.7677333333333334, + "grad_norm": 5.364052772521973, + "kl": 0.121826171875, + "learning_rate": 6.161333333333333e-07, + "loss": 0.0049, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5758 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.6875, + "epoch": 0.7678666666666667, + "grad_norm": 6.246182918548584, + "kl": 0.17919921875, + "learning_rate": 6.160666666666666e-07, + "loss": 0.0072, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5759 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 0.768, + "grad_norm": 4.7720232009887695, + "kl": 0.146484375, + "learning_rate": 6.16e-07, + "loss": 0.0059, + "reward": 1.25, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5760 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.875, + "epoch": 0.7681333333333333, + "grad_norm": 4.14869499206543, + "kl": 0.1904296875, + "learning_rate": 6.159333333333333e-07, + "loss": 0.0076, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5761 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.9375, + "epoch": 0.7682666666666667, + "grad_norm": 4.808554649353027, + "kl": 0.13037109375, + "learning_rate": 6.158666666666666e-07, + "loss": 0.0052, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5762 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.375, + "epoch": 0.7684, + "grad_norm": 8.222281455993652, + "kl": 0.2021484375, + "learning_rate": 6.158e-07, + "loss": 0.0081, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5763 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.625, + "epoch": 0.7685333333333333, + "grad_norm": 8.732898712158203, + "kl": 0.2197265625, + "learning_rate": 6.157333333333333e-07, + "loss": 0.0088, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5764 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8125, + "epoch": 0.7686666666666667, + "grad_norm": 8.275306701660156, + "kl": 0.14111328125, + "learning_rate": 6.156666666666667e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5765 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.75, + "epoch": 0.7688, + "grad_norm": 9.236218452453613, + "kl": 0.129638671875, + "learning_rate": 6.156e-07, + "loss": 0.0052, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5766 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0625, + "epoch": 0.7689333333333334, + "grad_norm": 4.924870014190674, + "kl": 0.13623046875, + "learning_rate": 6.155333333333334e-07, + "loss": 0.0055, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5767 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.1875, + "epoch": 0.7690666666666667, + "grad_norm": 6.512269973754883, + "kl": 0.169921875, + "learning_rate": 6.154666666666667e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.6746576428413391, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 5768 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.5, + "epoch": 0.7692, + "grad_norm": 9.625185012817383, + "kl": 0.36279296875, + "learning_rate": 6.154e-07, + "loss": 0.0145, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5769 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.75, + "epoch": 0.7693333333333333, + "grad_norm": 8.318655967712402, + "kl": 0.2958984375, + "learning_rate": 6.153333333333333e-07, + "loss": 0.0118, + "reward": 1.375, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 5770 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.875, + "epoch": 0.7694666666666666, + "grad_norm": 4.6400675773620605, + "kl": 0.125, + "learning_rate": 6.152666666666666e-07, + "loss": 0.005, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5771 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.875, + "epoch": 0.7696, + "grad_norm": 7.180632591247559, + "kl": 0.18359375, + "learning_rate": 6.152e-07, + "loss": 0.0074, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5772 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.375, + "epoch": 0.7697333333333334, + "grad_norm": 5.281545639038086, + "kl": 0.21142578125, + "learning_rate": 6.151333333333332e-07, + "loss": 0.0085, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5773 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.5, + "epoch": 0.7698666666666667, + "grad_norm": 3.2761077880859375, + "kl": 0.14453125, + "learning_rate": 6.150666666666666e-07, + "loss": 0.0058, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 5774 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.375, + "epoch": 0.77, + "grad_norm": 10.44161605834961, + "kl": 0.17919921875, + "learning_rate": 6.149999999999999e-07, + "loss": 0.0072, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5775 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 0.7701333333333333, + "grad_norm": 7.157438278198242, + "kl": 0.19775390625, + "learning_rate": 6.149333333333333e-07, + "loss": 0.0079, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5776 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.3125, + "epoch": 0.7702666666666667, + "grad_norm": 3.386967897415161, + "kl": 0.2021484375, + "learning_rate": 6.148666666666666e-07, + "loss": 0.0081, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5777 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.4375, + "epoch": 0.7704, + "grad_norm": 8.824287414550781, + "kl": 0.14453125, + "learning_rate": 6.148e-07, + "loss": 0.0058, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5778 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.875, + "epoch": 0.7705333333333333, + "grad_norm": 3.3216006755828857, + "kl": 0.17822265625, + "learning_rate": 6.147333333333333e-07, + "loss": 0.0071, + "reward": 1.0, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 5779 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 0.7706666666666667, + "grad_norm": 6.530666351318359, + "kl": 0.2294921875, + "learning_rate": 6.146666666666667e-07, + "loss": 0.0092, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5780 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8125, + "epoch": 0.7708, + "grad_norm": 8.490938186645508, + "kl": 0.15625, + "learning_rate": 6.146e-07, + "loss": 0.0062, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5781 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.875, + "epoch": 0.7709333333333334, + "grad_norm": 6.900384426116943, + "kl": 0.150146484375, + "learning_rate": 6.145333333333333e-07, + "loss": 0.006, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5782 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.625, + "epoch": 0.7710666666666667, + "grad_norm": 3.513000011444092, + "kl": 0.143310546875, + "learning_rate": 6.144666666666667e-07, + "loss": 0.0057, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5783 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.125, + "epoch": 0.7712, + "grad_norm": 0.35884878039360046, + "kl": 0.139404296875, + "learning_rate": 6.143999999999999e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5784 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.4375, + "epoch": 0.7713333333333333, + "grad_norm": 5.014357566833496, + "kl": 0.15478515625, + "learning_rate": 6.143333333333333e-07, + "loss": 0.0062, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5785 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 0.7714666666666666, + "grad_norm": 4.790518760681152, + "kl": 0.2314453125, + "learning_rate": 6.142666666666666e-07, + "loss": 0.0092, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5786 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 0.7716, + "grad_norm": 6.444324970245361, + "kl": 0.22119140625, + "learning_rate": 6.142e-07, + "loss": 0.0089, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5787 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.625, + "epoch": 0.7717333333333334, + "grad_norm": 4.365479946136475, + "kl": 0.166015625, + "learning_rate": 6.141333333333333e-07, + "loss": 0.0066, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5788 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.1875, + "epoch": 0.7718666666666667, + "grad_norm": 5.221035957336426, + "kl": 0.22119140625, + "learning_rate": 6.140666666666667e-07, + "loss": 0.0089, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5789 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.375, + "epoch": 0.772, + "grad_norm": 6.547038555145264, + "kl": 0.16259765625, + "learning_rate": 6.14e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5790 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.625, + "epoch": 0.7721333333333333, + "grad_norm": 13.623315811157227, + "kl": 0.390625, + "learning_rate": 6.139333333333333e-07, + "loss": 0.0156, + "reward": 1.3125, + "reward_std": 0.7833450436592102, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8125, + "step": 5791 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.0, + "epoch": 0.7722666666666667, + "grad_norm": 3.784558057785034, + "kl": 0.14208984375, + "learning_rate": 6.138666666666667e-07, + "loss": 0.0057, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5792 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.875, + "epoch": 0.7724, + "grad_norm": 6.874900817871094, + "kl": 0.16748046875, + "learning_rate": 6.138e-07, + "loss": 0.0067, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5793 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1875, + "epoch": 0.7725333333333333, + "grad_norm": 3.5564138889312744, + "kl": 0.3203125, + "learning_rate": 6.137333333333333e-07, + "loss": 0.0128, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5794 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6875, + "epoch": 0.7726666666666666, + "grad_norm": 6.598103046417236, + "kl": 0.1875, + "learning_rate": 6.136666666666666e-07, + "loss": 0.0075, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 5795 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.125, + "epoch": 0.7728, + "grad_norm": 7.761717796325684, + "kl": 0.15869140625, + "learning_rate": 6.136e-07, + "loss": 0.0063, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5796 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.0, + "epoch": 0.7729333333333334, + "grad_norm": 3.6142678260803223, + "kl": 0.17138671875, + "learning_rate": 6.135333333333333e-07, + "loss": 0.0069, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5797 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.125, + "epoch": 0.7730666666666667, + "grad_norm": 3.3603837490081787, + "kl": 0.163330078125, + "learning_rate": 6.134666666666667e-07, + "loss": 0.0065, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5798 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.125, + "epoch": 0.7732, + "grad_norm": 7.4853315353393555, + "kl": 0.162109375, + "learning_rate": 6.133999999999999e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5799 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.0625, + "epoch": 0.7733333333333333, + "grad_norm": 6.3754119873046875, + "kl": 0.14111328125, + "learning_rate": 6.133333333333332e-07, + "loss": 0.0057, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5800 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.4375, + "epoch": 0.7734666666666666, + "grad_norm": 5.265937805175781, + "kl": 0.1357421875, + "learning_rate": 6.132666666666666e-07, + "loss": 0.0054, + "reward": 1.25, + "reward_std": 0.579209566116333, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.9375, + "step": 5801 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.8125, + "epoch": 0.7736, + "grad_norm": 4.604977130889893, + "kl": 0.18798828125, + "learning_rate": 6.131999999999999e-07, + "loss": 0.0075, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5802 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.75, + "epoch": 0.7737333333333334, + "grad_norm": 7.707676410675049, + "kl": 0.1552734375, + "learning_rate": 6.131333333333333e-07, + "loss": 0.0062, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5803 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 0.7738666666666667, + "grad_norm": 7.834709644317627, + "kl": 0.28125, + "learning_rate": 6.130666666666666e-07, + "loss": 0.0112, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5804 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.9375, + "epoch": 0.774, + "grad_norm": 3.798572301864624, + "kl": 0.1474609375, + "learning_rate": 6.13e-07, + "loss": 0.0059, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5805 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.125, + "epoch": 0.7741333333333333, + "grad_norm": 6.102238178253174, + "kl": 0.1328125, + "learning_rate": 6.129333333333333e-07, + "loss": 0.0053, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5806 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.9375, + "epoch": 0.7742666666666667, + "grad_norm": 5.355408191680908, + "kl": 0.12939453125, + "learning_rate": 6.128666666666667e-07, + "loss": 0.0052, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5807 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.8125, + "epoch": 0.7744, + "grad_norm": 5.634716033935547, + "kl": 0.126708984375, + "learning_rate": 6.128e-07, + "loss": 0.0051, + "reward": 1.6875, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5808 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.875, + "epoch": 0.7745333333333333, + "grad_norm": 7.078091144561768, + "kl": 0.1572265625, + "learning_rate": 6.127333333333333e-07, + "loss": 0.0063, + "reward": 1.1875, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 5809 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.5625, + "epoch": 0.7746666666666666, + "grad_norm": 5.41220235824585, + "kl": 0.16162109375, + "learning_rate": 6.126666666666667e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5810 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.9375, + "epoch": 0.7748, + "grad_norm": 3.6103463172912598, + "kl": 0.101318359375, + "learning_rate": 6.126e-07, + "loss": 0.0041, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5811 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.125, + "epoch": 0.7749333333333334, + "grad_norm": 6.918585300445557, + "kl": 0.1328125, + "learning_rate": 6.125333333333334e-07, + "loss": 0.0053, + "reward": 1.625, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5812 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0625, + "epoch": 0.7750666666666667, + "grad_norm": 6.09077262878418, + "kl": 0.14111328125, + "learning_rate": 6.124666666666667e-07, + "loss": 0.0056, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5813 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 0.7752, + "grad_norm": 22.706722259521484, + "kl": 0.21533203125, + "learning_rate": 6.124000000000001e-07, + "loss": 0.0086, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5814 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.4375, + "epoch": 0.7753333333333333, + "grad_norm": 7.65964937210083, + "kl": 0.205078125, + "learning_rate": 6.123333333333332e-07, + "loss": 0.0082, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5815 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3125, + "epoch": 0.7754666666666666, + "grad_norm": 15.51512336730957, + "kl": 0.6220703125, + "learning_rate": 6.122666666666666e-07, + "loss": 0.0251, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5816 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.25, + "epoch": 0.7756, + "grad_norm": 7.854813098907471, + "kl": 0.171875, + "learning_rate": 6.121999999999999e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5817 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5, + "epoch": 0.7757333333333334, + "grad_norm": 6.727121829986572, + "kl": 0.173828125, + "learning_rate": 6.121333333333332e-07, + "loss": 0.0069, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5818 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.375, + "epoch": 0.7758666666666667, + "grad_norm": 1.515790581703186, + "kl": 0.2177734375, + "learning_rate": 6.120666666666666e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5819 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.875, + "epoch": 0.776, + "grad_norm": 6.293679237365723, + "kl": 0.2490234375, + "learning_rate": 6.119999999999999e-07, + "loss": 0.01, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5820 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0, + "epoch": 0.7761333333333333, + "grad_norm": 7.59114408493042, + "kl": 0.234375, + "learning_rate": 6.119333333333333e-07, + "loss": 0.0094, + "reward": 1.4375, + "reward_std": 0.7255652844905853, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.875, + "step": 5821 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.125, + "epoch": 0.7762666666666667, + "grad_norm": 8.376145362854004, + "kl": 0.2958984375, + "learning_rate": 6.118666666666666e-07, + "loss": 0.0118, + "reward": 1.125, + "reward_std": 0.7376964688301086, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8125, + "step": 5822 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.125, + "epoch": 0.7764, + "grad_norm": 5.434666633605957, + "kl": 0.21044921875, + "learning_rate": 6.118e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 5823 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.4375, + "epoch": 0.7765333333333333, + "grad_norm": 7.865795135498047, + "kl": 0.17529296875, + "learning_rate": 6.117333333333333e-07, + "loss": 0.007, + "reward": 1.375, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5824 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.4375, + "epoch": 0.7766666666666666, + "grad_norm": 5.9583635330200195, + "kl": 0.21728515625, + "learning_rate": 6.116666666666667e-07, + "loss": 0.0087, + "reward": 1.0625, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5825 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6875, + "epoch": 0.7768, + "grad_norm": 6.774709701538086, + "kl": 0.19189453125, + "learning_rate": 6.116e-07, + "loss": 0.0077, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5826 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.5, + "epoch": 0.7769333333333334, + "grad_norm": 20.13853645324707, + "kl": 0.1533203125, + "learning_rate": 6.115333333333334e-07, + "loss": 0.0061, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5827 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.25, + "epoch": 0.7770666666666667, + "grad_norm": 8.355587005615234, + "kl": 0.19140625, + "learning_rate": 6.114666666666667e-07, + "loss": 0.0077, + "reward": 1.5625, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5828 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.8125, + "epoch": 0.7772, + "grad_norm": 6.030824661254883, + "kl": 0.17041015625, + "learning_rate": 6.114e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5829 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.3125, + "epoch": 0.7773333333333333, + "grad_norm": 4.581868648529053, + "kl": 0.15234375, + "learning_rate": 6.113333333333333e-07, + "loss": 0.0061, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5830 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.7774666666666666, + "grad_norm": 5.474335670471191, + "kl": 0.23583984375, + "learning_rate": 6.112666666666666e-07, + "loss": 0.0095, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5831 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3125, + "epoch": 0.7776, + "grad_norm": 4.882074356079102, + "kl": 0.26416015625, + "learning_rate": 6.112e-07, + "loss": 0.0106, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5832 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.375, + "epoch": 0.7777333333333334, + "grad_norm": 6.99795389175415, + "kl": 0.210693359375, + "learning_rate": 6.111333333333333e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5833 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.125, + "epoch": 0.7778666666666667, + "grad_norm": 4.927011013031006, + "kl": 0.15576171875, + "learning_rate": 6.110666666666667e-07, + "loss": 0.0062, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5834 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.625, + "epoch": 0.778, + "grad_norm": 0.2985342741012573, + "kl": 0.139404296875, + "learning_rate": 6.11e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5835 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6875, + "epoch": 0.7781333333333333, + "grad_norm": 10.521323204040527, + "kl": 0.152587890625, + "learning_rate": 6.109333333333334e-07, + "loss": 0.0061, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5836 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.25, + "epoch": 0.7782666666666667, + "grad_norm": 6.753071308135986, + "kl": 0.203125, + "learning_rate": 6.108666666666666e-07, + "loss": 0.0081, + "reward": 1.625, + "reward_std": 0.6943650841712952, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 5837 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 0.7784, + "grad_norm": 17.054214477539062, + "kl": 0.162109375, + "learning_rate": 6.107999999999999e-07, + "loss": 0.0065, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5838 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.4375, + "epoch": 0.7785333333333333, + "grad_norm": 5.825204372406006, + "kl": 0.16943359375, + "learning_rate": 6.107333333333333e-07, + "loss": 0.0068, + "reward": 1.25, + "reward_std": 0.6746576428413391, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 5839 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5625, + "epoch": 0.7786666666666666, + "grad_norm": 6.127478122711182, + "kl": 0.15185546875, + "learning_rate": 6.106666666666666e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5840 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.7788, + "grad_norm": 13.475972175598145, + "kl": 0.3310546875, + "learning_rate": 6.106e-07, + "loss": 0.0132, + "reward": 1.5, + "reward_std": 0.6452257037162781, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5841 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.4375, + "epoch": 0.7789333333333334, + "grad_norm": 38.781124114990234, + "kl": 2.21044921875, + "learning_rate": 6.105333333333333e-07, + "loss": 0.0889, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5842 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9375, + "epoch": 0.7790666666666667, + "grad_norm": 6.0517778396606445, + "kl": 0.22802734375, + "learning_rate": 6.104666666666667e-07, + "loss": 0.0091, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5843 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0, + "epoch": 0.7792, + "grad_norm": 6.608054161071777, + "kl": 0.22314453125, + "learning_rate": 6.104e-07, + "loss": 0.0089, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9375, + "step": 5844 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 0.7793333333333333, + "grad_norm": 6.568997859954834, + "kl": 0.263671875, + "learning_rate": 6.103333333333333e-07, + "loss": 0.0105, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5845 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6875, + "epoch": 0.7794666666666666, + "grad_norm": 5.451408863067627, + "kl": 0.17431640625, + "learning_rate": 6.102666666666666e-07, + "loss": 0.007, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5846 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4375, + "epoch": 0.7796, + "grad_norm": 7.745421409606934, + "kl": 0.2060546875, + "learning_rate": 6.101999999999999e-07, + "loss": 0.0083, + "reward": 1.3125, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5847 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.375, + "epoch": 0.7797333333333333, + "grad_norm": 0.3493758738040924, + "kl": 0.22314453125, + "learning_rate": 6.101333333333333e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5848 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.9375, + "epoch": 0.7798666666666667, + "grad_norm": 3.9053943157196045, + "kl": 0.25390625, + "learning_rate": 6.100666666666666e-07, + "loss": 0.0101, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5849 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.5, + "epoch": 0.78, + "grad_norm": 7.539649963378906, + "kl": 0.3271484375, + "learning_rate": 6.1e-07, + "loss": 0.0131, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5850 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.875, + "epoch": 0.7801333333333333, + "grad_norm": 7.228489398956299, + "kl": 0.30615234375, + "learning_rate": 6.099333333333333e-07, + "loss": 0.0122, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5851 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.75, + "epoch": 0.7802666666666667, + "grad_norm": 5.604074478149414, + "kl": 0.146484375, + "learning_rate": 6.098666666666667e-07, + "loss": 0.0059, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5852 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 0.7804, + "grad_norm": 71.22688293457031, + "kl": 0.247802734375, + "learning_rate": 6.098e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5853 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.875, + "epoch": 0.7805333333333333, + "grad_norm": 4.998234272003174, + "kl": 0.15673828125, + "learning_rate": 6.097333333333334e-07, + "loss": 0.0063, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5854 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.9375, + "epoch": 0.7806666666666666, + "grad_norm": 10.799635887145996, + "kl": 0.35498046875, + "learning_rate": 6.096666666666667e-07, + "loss": 0.0142, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 5855 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.6875, + "epoch": 0.7808, + "grad_norm": 6.320210933685303, + "kl": 0.1533203125, + "learning_rate": 6.096e-07, + "loss": 0.0061, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5856 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.8125, + "epoch": 0.7809333333333334, + "grad_norm": 0.4664636552333832, + "kl": 0.244140625, + "learning_rate": 6.095333333333334e-07, + "loss": 0.0098, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5857 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5625, + "epoch": 0.7810666666666667, + "grad_norm": 6.225756645202637, + "kl": 0.1669921875, + "learning_rate": 6.094666666666666e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5858 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 0.7812, + "grad_norm": 7.518535137176514, + "kl": 0.2353515625, + "learning_rate": 6.094e-07, + "loss": 0.0094, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5859 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.875, + "epoch": 0.7813333333333333, + "grad_norm": 6.527128219604492, + "kl": 0.21533203125, + "learning_rate": 6.093333333333332e-07, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5860 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4375, + "epoch": 0.7814666666666666, + "grad_norm": 0.4454389810562134, + "kl": 0.2587890625, + "learning_rate": 6.092666666666666e-07, + "loss": 0.0103, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5861 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.125, + "epoch": 0.7816, + "grad_norm": 6.956679344177246, + "kl": 0.115966796875, + "learning_rate": 6.091999999999999e-07, + "loss": 0.0046, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5862 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 0.7817333333333333, + "grad_norm": 6.496237754821777, + "kl": 0.2197265625, + "learning_rate": 6.091333333333333e-07, + "loss": 0.0088, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 5863 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.6875, + "epoch": 0.7818666666666667, + "grad_norm": 7.769206523895264, + "kl": 0.1748046875, + "learning_rate": 6.090666666666666e-07, + "loss": 0.007, + "reward": 1.75, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5864 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.4375, + "epoch": 0.782, + "grad_norm": 25.823152542114258, + "kl": 0.23876953125, + "learning_rate": 6.089999999999999e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5865 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.3125, + "epoch": 0.7821333333333333, + "grad_norm": 4.328371524810791, + "kl": 0.11376953125, + "learning_rate": 6.089333333333333e-07, + "loss": 0.0046, + "reward": 1.0625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5866 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.7822666666666667, + "grad_norm": 5.1936421394348145, + "kl": 0.1943359375, + "learning_rate": 6.088666666666666e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5867 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.125, + "epoch": 0.7824, + "grad_norm": 38.16549301147461, + "kl": 0.5966796875, + "learning_rate": 6.088e-07, + "loss": 0.0239, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9375, + "step": 5868 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5625, + "epoch": 0.7825333333333333, + "grad_norm": 5.663207530975342, + "kl": 0.14697265625, + "learning_rate": 6.087333333333333e-07, + "loss": 0.0059, + "reward": 1.625, + "reward_std": 0.7315178513526917, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 5869 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.3125, + "epoch": 0.7826666666666666, + "grad_norm": 8.83761215209961, + "kl": 0.20166015625, + "learning_rate": 6.086666666666667e-07, + "loss": 0.0081, + "reward": 1.375, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5870 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0625, + "epoch": 0.7828, + "grad_norm": 20.051380157470703, + "kl": 1.21044921875, + "learning_rate": 6.086e-07, + "loss": 0.0483, + "reward": 1.4375, + "reward_std": 0.5518900156021118, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5871 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.25, + "epoch": 0.7829333333333334, + "grad_norm": 5.979453086853027, + "kl": 0.130859375, + "learning_rate": 6.085333333333334e-07, + "loss": 0.0052, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5872 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.6875, + "epoch": 0.7830666666666667, + "grad_norm": 4.978777885437012, + "kl": 0.17138671875, + "learning_rate": 6.084666666666667e-07, + "loss": 0.0069, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5873 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 0.7832, + "grad_norm": 6.504572868347168, + "kl": 0.1591796875, + "learning_rate": 6.084000000000001e-07, + "loss": 0.0064, + "reward": 1.1875, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5874 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0625, + "epoch": 0.7833333333333333, + "grad_norm": 0.36215323209762573, + "kl": 0.19140625, + "learning_rate": 6.083333333333333e-07, + "loss": 0.0077, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8125, + "epoch": 0.7834666666666666, + "grad_norm": 3.0467958450317383, + "kl": 0.21923828125, + "learning_rate": 6.082666666666666e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5876 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.7836, + "grad_norm": 8.492506980895996, + "kl": 0.162109375, + "learning_rate": 6.082e-07, + "loss": 0.0065, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 5877 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9375, + "epoch": 0.7837333333333333, + "grad_norm": 8.555720329284668, + "kl": 0.2421875, + "learning_rate": 6.081333333333332e-07, + "loss": 0.0097, + "reward": 1.25, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.875, + "step": 5878 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.125, + "epoch": 0.7838666666666667, + "grad_norm": 6.962512969970703, + "kl": 0.1865234375, + "learning_rate": 6.080666666666666e-07, + "loss": 0.0075, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5879 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.8125, + "epoch": 0.784, + "grad_norm": 6.703366756439209, + "kl": 0.24951171875, + "learning_rate": 6.079999999999999e-07, + "loss": 0.01, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5880 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.375, + "epoch": 0.7841333333333333, + "grad_norm": 6.85366153717041, + "kl": 0.1669921875, + "learning_rate": 6.079333333333333e-07, + "loss": 0.0067, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5881 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.5625, + "epoch": 0.7842666666666667, + "grad_norm": 5.893631458282471, + "kl": 0.3701171875, + "learning_rate": 6.078666666666666e-07, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.875, + "step": 5882 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3125, + "epoch": 0.7844, + "grad_norm": 7.334555149078369, + "kl": 0.18798828125, + "learning_rate": 6.078e-07, + "loss": 0.0075, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5883 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.6875, + "epoch": 0.7845333333333333, + "grad_norm": 109.02449798583984, + "kl": 0.12939453125, + "learning_rate": 6.077333333333333e-07, + "loss": 0.0052, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5884 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1875, + "epoch": 0.7846666666666666, + "grad_norm": 9.124953269958496, + "kl": 0.19580078125, + "learning_rate": 6.076666666666666e-07, + "loss": 0.0078, + "reward": 1.0625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9375, + "step": 5885 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0625, + "epoch": 0.7848, + "grad_norm": 7.155543327331543, + "kl": 0.19580078125, + "learning_rate": 6.076e-07, + "loss": 0.0078, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5886 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25, + "epoch": 0.7849333333333334, + "grad_norm": 5.9055352210998535, + "kl": 0.1689453125, + "learning_rate": 6.075333333333333e-07, + "loss": 0.0068, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5887 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.7850666666666667, + "grad_norm": 17.314653396606445, + "kl": 0.1767578125, + "learning_rate": 6.074666666666667e-07, + "loss": 0.0071, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5888 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5, + "epoch": 0.7852, + "grad_norm": 7.573689937591553, + "kl": 0.16552734375, + "learning_rate": 6.074e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5889 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 0.7853333333333333, + "grad_norm": 4.5128302574157715, + "kl": 0.1640625, + "learning_rate": 6.073333333333333e-07, + "loss": 0.0066, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 5890 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.125, + "epoch": 0.7854666666666666, + "grad_norm": 4.488035202026367, + "kl": 0.244140625, + "learning_rate": 6.072666666666666e-07, + "loss": 0.0098, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5891 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.625, + "epoch": 0.7856, + "grad_norm": 13.083112716674805, + "kl": 0.30224609375, + "learning_rate": 6.072e-07, + "loss": 0.0121, + "reward": 1.5625, + "reward_std": 0.6123279631137848, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5892 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.4375, + "epoch": 0.7857333333333333, + "grad_norm": 7.682229518890381, + "kl": 0.2177734375, + "learning_rate": 6.071333333333333e-07, + "loss": 0.0087, + "reward": 1.4375, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5893 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.7858666666666667, + "grad_norm": 7.730988025665283, + "kl": 0.1630859375, + "learning_rate": 6.070666666666666e-07, + "loss": 0.0065, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5894 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6875, + "epoch": 0.786, + "grad_norm": 7.429459571838379, + "kl": 0.19580078125, + "learning_rate": 6.07e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5895 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 0.7861333333333334, + "grad_norm": 4.8645782470703125, + "kl": 0.1572265625, + "learning_rate": 6.069333333333333e-07, + "loss": 0.0063, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5896 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.1875, + "epoch": 0.7862666666666667, + "grad_norm": 5.346808910369873, + "kl": 0.2470703125, + "learning_rate": 6.068666666666667e-07, + "loss": 0.0099, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5897 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0625, + "epoch": 0.7864, + "grad_norm": 6.782285690307617, + "kl": 0.147705078125, + "learning_rate": 6.068e-07, + "loss": 0.0059, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5898 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3125, + "epoch": 0.7865333333333333, + "grad_norm": 5.920837879180908, + "kl": 0.192138671875, + "learning_rate": 6.067333333333334e-07, + "loss": 0.0077, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5899 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.1875, + "epoch": 0.7866666666666666, + "grad_norm": 0.7075868248939514, + "kl": 0.259765625, + "learning_rate": 6.066666666666666e-07, + "loss": 0.0104, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5900 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 0.7868, + "grad_norm": 10.374283790588379, + "kl": 0.4931640625, + "learning_rate": 6.066e-07, + "loss": 0.0197, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5901 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.25, + "epoch": 0.7869333333333334, + "grad_norm": 7.367719650268555, + "kl": 0.20556640625, + "learning_rate": 6.065333333333333e-07, + "loss": 0.0082, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5902 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 0.7870666666666667, + "grad_norm": 0.7412270307540894, + "kl": 0.21337890625, + "learning_rate": 6.064666666666666e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5903 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0, + "epoch": 0.7872, + "grad_norm": 49.46866989135742, + "kl": 0.171875, + "learning_rate": 6.064e-07, + "loss": 0.0069, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5904 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.75, + "epoch": 0.7873333333333333, + "grad_norm": 4.907528400421143, + "kl": 0.220703125, + "learning_rate": 6.063333333333332e-07, + "loss": 0.0088, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5905 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0625, + "epoch": 0.7874666666666666, + "grad_norm": 4.889151573181152, + "kl": 0.20458984375, + "learning_rate": 6.062666666666666e-07, + "loss": 0.0082, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5906 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.7876, + "grad_norm": 0.43162912130355835, + "kl": 0.2880859375, + "learning_rate": 6.061999999999999e-07, + "loss": 0.0115, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5907 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0, + "epoch": 0.7877333333333333, + "grad_norm": 5.469207286834717, + "kl": 0.1650390625, + "learning_rate": 6.061333333333333e-07, + "loss": 0.0066, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5908 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.8125, + "epoch": 0.7878666666666667, + "grad_norm": 8.74383544921875, + "kl": 0.220703125, + "learning_rate": 6.060666666666666e-07, + "loss": 0.0088, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5909 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.75, + "epoch": 0.788, + "grad_norm": 11.00114631652832, + "kl": 0.18212890625, + "learning_rate": 6.06e-07, + "loss": 0.0073, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5910 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.9375, + "epoch": 0.7881333333333334, + "grad_norm": 5.861100196838379, + "kl": 0.21484375, + "learning_rate": 6.059333333333333e-07, + "loss": 0.0086, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5911 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.9375, + "epoch": 0.7882666666666667, + "grad_norm": 4.6017680168151855, + "kl": 0.1806640625, + "learning_rate": 6.058666666666666e-07, + "loss": 0.0072, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5912 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 0.7884, + "grad_norm": 6.293627738952637, + "kl": 0.154296875, + "learning_rate": 6.058e-07, + "loss": 0.0062, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5913 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.8125, + "epoch": 0.7885333333333333, + "grad_norm": 5.459499835968018, + "kl": 0.1650390625, + "learning_rate": 6.057333333333333e-07, + "loss": 0.0066, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5914 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 0.7886666666666666, + "grad_norm": 7.359193801879883, + "kl": 0.17529296875, + "learning_rate": 6.056666666666667e-07, + "loss": 0.007, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5915 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.625, + "epoch": 0.7888, + "grad_norm": 55.59626007080078, + "kl": 0.2451171875, + "learning_rate": 6.056e-07, + "loss": 0.0098, + "reward": 1.5, + "reward_std": 0.7440237998962402, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.875, + "step": 5916 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.9375, + "epoch": 0.7889333333333334, + "grad_norm": 6.685770034790039, + "kl": 0.19580078125, + "learning_rate": 6.055333333333334e-07, + "loss": 0.0078, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5917 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.875, + "epoch": 0.7890666666666667, + "grad_norm": 3.513814926147461, + "kl": 0.14208984375, + "learning_rate": 6.054666666666667e-07, + "loss": 0.0057, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5918 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8125, + "epoch": 0.7892, + "grad_norm": 0.32148292660713196, + "kl": 0.154052734375, + "learning_rate": 6.054000000000001e-07, + "loss": 0.0062, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5919 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 0.7893333333333333, + "grad_norm": 22.718324661254883, + "kl": 0.159423828125, + "learning_rate": 6.053333333333332e-07, + "loss": 0.0064, + "reward": 1.0, + "reward_std": 0.49721167981624603, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.875, + "step": 5920 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0625, + "epoch": 0.7894666666666666, + "grad_norm": 8.920133590698242, + "kl": 0.2109375, + "learning_rate": 6.052666666666665e-07, + "loss": 0.0084, + "reward": 1.625, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5921 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3125, + "epoch": 0.7896, + "grad_norm": 6.1828203201293945, + "kl": 0.181640625, + "learning_rate": 6.051999999999999e-07, + "loss": 0.0073, + "reward": 1.625, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5922 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.8125, + "epoch": 0.7897333333333333, + "grad_norm": 5.3463287353515625, + "kl": 0.22314453125, + "learning_rate": 6.051333333333332e-07, + "loss": 0.0089, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5923 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.8125, + "epoch": 0.7898666666666667, + "grad_norm": 5.635403633117676, + "kl": 0.1904296875, + "learning_rate": 6.050666666666666e-07, + "loss": 0.0076, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5924 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.625, + "epoch": 0.79, + "grad_norm": 5.608335971832275, + "kl": 0.1591796875, + "learning_rate": 6.049999999999999e-07, + "loss": 0.0064, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5925 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.6875, + "epoch": 0.7901333333333334, + "grad_norm": 6.197031497955322, + "kl": 0.149658203125, + "learning_rate": 6.049333333333333e-07, + "loss": 0.006, + "reward": 1.1875, + "reward_std": 0.554741159081459, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 5926 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 0.7902666666666667, + "grad_norm": 9.918911933898926, + "kl": 0.2890625, + "learning_rate": 6.048666666666666e-07, + "loss": 0.0116, + "reward": 1.3125, + "reward_std": 0.6392731368541718, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5927 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.7904, + "grad_norm": 35.699745178222656, + "kl": 0.376953125, + "learning_rate": 6.048e-07, + "loss": 0.0151, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5928 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 0.7905333333333333, + "grad_norm": 3.968254566192627, + "kl": 0.140625, + "learning_rate": 6.047333333333333e-07, + "loss": 0.0056, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5929 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.25, + "epoch": 0.7906666666666666, + "grad_norm": 7.51369047164917, + "kl": 0.1796875, + "learning_rate": 6.046666666666667e-07, + "loss": 0.0072, + "reward": 1.375, + "reward_std": 0.5487885922193527, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5930 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.7908, + "grad_norm": 6.599170684814453, + "kl": 0.19580078125, + "learning_rate": 6.046e-07, + "loss": 0.0078, + "reward": 1.625, + "reward_std": 0.6094194948673248, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5931 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.25, + "epoch": 0.7909333333333334, + "grad_norm": 11.627262115478516, + "kl": 0.26708984375, + "learning_rate": 6.045333333333333e-07, + "loss": 0.0107, + "reward": 1.5, + "reward_std": 0.5850084125995636, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5932 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.8125, + "epoch": 0.7910666666666667, + "grad_norm": 1.0238374471664429, + "kl": 0.3046875, + "learning_rate": 6.044666666666667e-07, + "loss": 0.0122, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5933 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.9375, + "epoch": 0.7912, + "grad_norm": 4.394799709320068, + "kl": 0.14306640625, + "learning_rate": 6.044e-07, + "loss": 0.0057, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5934 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8125, + "epoch": 0.7913333333333333, + "grad_norm": 23.240503311157227, + "kl": 0.20458984375, + "learning_rate": 6.043333333333333e-07, + "loss": 0.0082, + "reward": 1.625, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 5935 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.6875, + "epoch": 0.7914666666666667, + "grad_norm": 20.972469329833984, + "kl": 0.1787109375, + "learning_rate": 6.042666666666666e-07, + "loss": 0.0071, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5936 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.875, + "epoch": 0.7916, + "grad_norm": 6.882827281951904, + "kl": 0.16748046875, + "learning_rate": 6.042e-07, + "loss": 0.0067, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5937 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.375, + "epoch": 0.7917333333333333, + "grad_norm": 0.6939585208892822, + "kl": 0.212890625, + "learning_rate": 6.041333333333333e-07, + "loss": 0.0085, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5938 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5, + "epoch": 0.7918666666666667, + "grad_norm": 6.51461935043335, + "kl": 0.181640625, + "learning_rate": 6.040666666666667e-07, + "loss": 0.0073, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5939 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.9375, + "epoch": 0.792, + "grad_norm": 6.07952356338501, + "kl": 0.2265625, + "learning_rate": 6.04e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 5940 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.6875, + "epoch": 0.7921333333333334, + "grad_norm": 8.50638484954834, + "kl": 0.1826171875, + "learning_rate": 6.039333333333332e-07, + "loss": 0.0073, + "reward": 1.375, + "reward_std": 0.49871626496315, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5941 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5625, + "epoch": 0.7922666666666667, + "grad_norm": 53.23040008544922, + "kl": 0.15771484375, + "learning_rate": 6.038666666666666e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5942 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.875, + "epoch": 0.7924, + "grad_norm": 6.007351398468018, + "kl": 0.23388671875, + "learning_rate": 6.037999999999999e-07, + "loss": 0.0094, + "reward": 1.5, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5943 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.3125, + "epoch": 0.7925333333333333, + "grad_norm": 4.885415554046631, + "kl": 0.18994140625, + "learning_rate": 6.037333333333333e-07, + "loss": 0.0076, + "reward": 1.1875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 5944 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.875, + "epoch": 0.7926666666666666, + "grad_norm": 6.459843158721924, + "kl": 0.151611328125, + "learning_rate": 6.036666666666666e-07, + "loss": 0.0061, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 5945 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5, + "epoch": 0.7928, + "grad_norm": 7.787568092346191, + "kl": 0.236328125, + "learning_rate": 6.036e-07, + "loss": 0.0094, + "reward": 1.6875, + "reward_std": 0.6034669280052185, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 5946 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.25, + "epoch": 0.7929333333333334, + "grad_norm": 4.939608573913574, + "kl": 0.3349609375, + "learning_rate": 6.035333333333333e-07, + "loss": 0.0134, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5947 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6875, + "epoch": 0.7930666666666667, + "grad_norm": 5.220940589904785, + "kl": 0.122802734375, + "learning_rate": 6.034666666666667e-07, + "loss": 0.0049, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5948 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.3125, + "epoch": 0.7932, + "grad_norm": 6.887910842895508, + "kl": 0.18505859375, + "learning_rate": 6.034e-07, + "loss": 0.0074, + "reward": 1.125, + "reward_std": 0.6208146214485168, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 5949 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.5625, + "epoch": 0.7933333333333333, + "grad_norm": 13.470952033996582, + "kl": 0.57666015625, + "learning_rate": 6.033333333333333e-07, + "loss": 0.0231, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.875, + "step": 5950 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5625, + "epoch": 0.7934666666666667, + "grad_norm": 0.2876306474208832, + "kl": 0.1708984375, + "learning_rate": 6.032666666666666e-07, + "loss": 0.0068, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5951 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6875, + "epoch": 0.7936, + "grad_norm": 4.591121673583984, + "kl": 0.17236328125, + "learning_rate": 6.031999999999999e-07, + "loss": 0.0069, + "reward": 1.6875, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9375, + "step": 5952 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.5, + "epoch": 0.7937333333333333, + "grad_norm": 6.207591533660889, + "kl": 0.16650390625, + "learning_rate": 6.031333333333333e-07, + "loss": 0.0067, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5953 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0, + "epoch": 0.7938666666666667, + "grad_norm": 3.9752557277679443, + "kl": 0.16455078125, + "learning_rate": 6.030666666666666e-07, + "loss": 0.0066, + "reward": 1.5625, + "reward_std": 0.3204349875450134, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9375, + "step": 5954 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 0.794, + "grad_norm": 9.055590629577637, + "kl": 0.23828125, + "learning_rate": 6.03e-07, + "loss": 0.0096, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5955 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0625, + "epoch": 0.7941333333333334, + "grad_norm": 13.403589248657227, + "kl": 0.2939453125, + "learning_rate": 6.029333333333333e-07, + "loss": 0.0118, + "reward": 1.5, + "reward_std": 0.5175491571426392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5956 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.0625, + "epoch": 0.7942666666666667, + "grad_norm": 6.021942615509033, + "kl": 0.2255859375, + "learning_rate": 6.028666666666667e-07, + "loss": 0.009, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5957 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.25, + "epoch": 0.7944, + "grad_norm": 6.945180416107178, + "kl": 0.222900390625, + "learning_rate": 6.028e-07, + "loss": 0.0089, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5958 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.3125, + "epoch": 0.7945333333333333, + "grad_norm": 10.691861152648926, + "kl": 0.22705078125, + "learning_rate": 6.027333333333333e-07, + "loss": 0.0091, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 5959 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 0.7946666666666666, + "grad_norm": 3.368602752685547, + "kl": 0.1787109375, + "learning_rate": 6.026666666666667e-07, + "loss": 0.0071, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5960 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3125, + "epoch": 0.7948, + "grad_norm": 68.48944854736328, + "kl": 0.28955078125, + "learning_rate": 6.026e-07, + "loss": 0.0116, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.875, + "step": 5961 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.6875, + "epoch": 0.7949333333333334, + "grad_norm": 5.07491397857666, + "kl": 0.3330078125, + "learning_rate": 6.025333333333334e-07, + "loss": 0.0133, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5962 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.25, + "epoch": 0.7950666666666667, + "grad_norm": 6.884893417358398, + "kl": 0.20068359375, + "learning_rate": 6.024666666666666e-07, + "loss": 0.008, + "reward": 1.3125, + "reward_std": 0.44403792917728424, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5963 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.8125, + "epoch": 0.7952, + "grad_norm": 6.123778820037842, + "kl": 0.17626953125, + "learning_rate": 6.024e-07, + "loss": 0.007, + "reward": 1.6875, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5964 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.375, + "epoch": 0.7953333333333333, + "grad_norm": 18.50387954711914, + "kl": 0.3359375, + "learning_rate": 6.023333333333333e-07, + "loss": 0.0134, + "reward": 1.75, + "reward_std": 0.4355512708425522, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5965 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.75, + "epoch": 0.7954666666666667, + "grad_norm": 7.295956134796143, + "kl": 0.4599609375, + "learning_rate": 6.022666666666666e-07, + "loss": 0.0184, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 5966 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 0.7956, + "grad_norm": 8.371148109436035, + "kl": 0.21630859375, + "learning_rate": 6.021999999999999e-07, + "loss": 0.0087, + "reward": 1.5, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9375, + "step": 5967 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.75, + "epoch": 0.7957333333333333, + "grad_norm": 4.4917707443237305, + "kl": 0.14453125, + "learning_rate": 6.021333333333332e-07, + "loss": 0.0058, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5968 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.3125, + "epoch": 0.7958666666666666, + "grad_norm": 13.902915954589844, + "kl": 0.369140625, + "learning_rate": 6.020666666666666e-07, + "loss": 0.0147, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 5969 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.8125, + "epoch": 0.796, + "grad_norm": 30.82002830505371, + "kl": 0.238525390625, + "learning_rate": 6.019999999999999e-07, + "loss": 0.0096, + "reward": 1.5625, + "reward_std": 0.5260358154773712, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5970 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.625, + "epoch": 0.7961333333333334, + "grad_norm": 5.089845657348633, + "kl": 0.2041015625, + "learning_rate": 6.019333333333333e-07, + "loss": 0.0082, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5971 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.4375, + "epoch": 0.7962666666666667, + "grad_norm": 9.128155708312988, + "kl": 0.19873046875, + "learning_rate": 6.018666666666666e-07, + "loss": 0.0079, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5972 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.4375, + "epoch": 0.7964, + "grad_norm": 7.65203857421875, + "kl": 0.3994140625, + "learning_rate": 6.018e-07, + "loss": 0.016, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5973 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.4375, + "epoch": 0.7965333333333333, + "grad_norm": 12.765548706054688, + "kl": 0.255859375, + "learning_rate": 6.017333333333333e-07, + "loss": 0.0103, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5974 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.75, + "epoch": 0.7966666666666666, + "grad_norm": 4.2428693771362305, + "kl": 0.1884765625, + "learning_rate": 6.016666666666667e-07, + "loss": 0.0075, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5975 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.6875, + "epoch": 0.7968, + "grad_norm": 6.472564220428467, + "kl": 0.3369140625, + "learning_rate": 6.016e-07, + "loss": 0.0135, + "reward": 1.3125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.875, + "step": 5976 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.25, + "epoch": 0.7969333333333334, + "grad_norm": 0.5934532880783081, + "kl": 0.265625, + "learning_rate": 6.015333333333334e-07, + "loss": 0.0106, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5977 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.8125, + "epoch": 0.7970666666666667, + "grad_norm": 4.464727878570557, + "kl": 0.265625, + "learning_rate": 6.014666666666667e-07, + "loss": 0.0106, + "reward": 1.125, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 5978 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.0, + "epoch": 0.7972, + "grad_norm": 8.789402961730957, + "kl": 0.2265625, + "learning_rate": 6.014e-07, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5979 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.8125, + "epoch": 0.7973333333333333, + "grad_norm": 9.818160057067871, + "kl": 0.283203125, + "learning_rate": 6.013333333333334e-07, + "loss": 0.0113, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9375, + "step": 5980 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 0.7974666666666667, + "grad_norm": 4.737703323364258, + "kl": 0.18896484375, + "learning_rate": 6.012666666666666e-07, + "loss": 0.0076, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5981 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.25, + "epoch": 0.7976, + "grad_norm": 3.8230624198913574, + "kl": 0.19482421875, + "learning_rate": 6.012e-07, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5982 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.5625, + "epoch": 0.7977333333333333, + "grad_norm": 9.572763442993164, + "kl": 0.3505859375, + "learning_rate": 6.011333333333333e-07, + "loss": 0.014, + "reward": 1.375, + "reward_std": 0.6307864785194397, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 5983 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.625, + "epoch": 0.7978666666666666, + "grad_norm": 5.56707239151001, + "kl": 0.2578125, + "learning_rate": 6.010666666666666e-07, + "loss": 0.0103, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 5984 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 0.798, + "grad_norm": 0.3421449363231659, + "kl": 0.167724609375, + "learning_rate": 6.009999999999999e-07, + "loss": 0.0067, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5985 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.5625, + "epoch": 0.7981333333333334, + "grad_norm": 8.574501037597656, + "kl": 0.161376953125, + "learning_rate": 6.009333333333333e-07, + "loss": 0.0065, + "reward": 1.4375, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 5986 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.8125, + "epoch": 0.7982666666666667, + "grad_norm": 12.032696723937988, + "kl": 0.3017578125, + "learning_rate": 6.008666666666666e-07, + "loss": 0.012, + "reward": 1.8125, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 5987 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.125, + "epoch": 0.7984, + "grad_norm": 5.215874671936035, + "kl": 0.17822265625, + "learning_rate": 6.007999999999999e-07, + "loss": 0.0071, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5988 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.75, + "epoch": 0.7985333333333333, + "grad_norm": 3.6210389137268066, + "kl": 0.2490234375, + "learning_rate": 6.007333333333333e-07, + "loss": 0.01, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 5989 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.6875, + "epoch": 0.7986666666666666, + "grad_norm": 6.628279209136963, + "kl": 0.26416015625, + "learning_rate": 6.006666666666666e-07, + "loss": 0.0106, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5990 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.9375, + "epoch": 0.7988, + "grad_norm": 7.182236671447754, + "kl": 0.26220703125, + "learning_rate": 6.006e-07, + "loss": 0.0105, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 5991 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.5625, + "epoch": 0.7989333333333334, + "grad_norm": 0.37489861249923706, + "kl": 0.1728515625, + "learning_rate": 6.005333333333333e-07, + "loss": 0.0069, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 5992 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.75, + "epoch": 0.7990666666666667, + "grad_norm": 5.3225836753845215, + "kl": 0.23193359375, + "learning_rate": 6.004666666666667e-07, + "loss": 0.0093, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 5993 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9375, + "epoch": 0.7992, + "grad_norm": 11.246674537658691, + "kl": 0.156494140625, + "learning_rate": 6.004e-07, + "loss": 0.0063, + "reward": 1.3125, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 5994 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 0.7993333333333333, + "grad_norm": 3.7861573696136475, + "kl": 0.1943359375, + "learning_rate": 6.003333333333334e-07, + "loss": 0.0078, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5995 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.1875, + "epoch": 0.7994666666666667, + "grad_norm": 14.580056190490723, + "kl": 0.837890625, + "learning_rate": 6.002666666666666e-07, + "loss": 0.0335, + "reward": 1.8125, + "reward_std": 0.5303300768136978, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9375, + "step": 5996 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.1875, + "epoch": 0.7996, + "grad_norm": 5.4283013343811035, + "kl": 0.32275390625, + "learning_rate": 6.001999999999999e-07, + "loss": 0.0129, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 5997 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.9375, + "epoch": 0.7997333333333333, + "grad_norm": 0.37234529852867126, + "kl": 0.22998046875, + "learning_rate": 6.001333333333333e-07, + "loss": 0.0092, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5998 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.1875, + "epoch": 0.7998666666666666, + "grad_norm": 0.7148362994194031, + "kl": 0.14794921875, + "learning_rate": 6.000666666666666e-07, + "loss": 0.0059, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 5999 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.0625, + "epoch": 0.8, + "grad_norm": 6.999229431152344, + "kl": 0.20849609375, + "learning_rate": 6e-07, + "loss": 0.0083, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9375, + "step": 6000 + } + ], + "logging_steps": 1.0, + "max_steps": 15000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}