Fix script

Browse files

Files changed (2) hide show

src/run.sh +1 -1
src/{run_ed_recipe_nlg.py → run_recipe_nlg_flax.py} +14 -4

src/run.sh CHANGED Viewed

@@ -27,7 +27,7 @@ export LOGGING_STEPS=500
 export EVAL_STEPS=2500
 export SAVE_STEPS=2500
-python src/run_ed_recipe_nlg.py \
     --output_dir="$OUTPUT_DIR"  \
     --train_file="$TRAIN_FILE" \
     --validation_file="$VALIDATION_FILE" \

 export EVAL_STEPS=2500
 export SAVE_STEPS=2500
+python src/run_recipe_nlg_flax.py \
     --output_dir="$OUTPUT_DIR"  \
     --train_file="$TRAIN_FILE" \
     --validation_file="$VALIDATION_FILE" \

src/{run_ed_recipe_nlg.py → run_recipe_nlg_flax.py} RENAMED Viewed

@@ -779,7 +779,9 @@ def main():
                 # Save metrics
                 train_metric = unreplicate(train_metric)
                 train_time += time.time() - train_start
                 if has_tensorboard and jax.process_index() == 0:
                     write_train_metric(summary_writer, train_metrics, train_time, cur_step)
                 epochs.write(
@@ -789,6 +791,7 @@ def main():
                 train_metrics = []
             if cur_step % training_args.eval_steps == 0 and cur_step > 0 and training_args.do_eval:
                 eval_metrics = []
                 eval_preds = []
                 eval_labels = []
@@ -827,20 +830,27 @@ def main():
                 # Save metrics
                 if has_tensorboard and jax.process_index() == 0:
-                    cur_step = epoch * (len(train_dataset) // train_batch_size)
                     write_eval_metric(summary_writer, eval_metrics, cur_step)
             if cur_step % training_args.save_steps == 0 and cur_step > 0:
-                # save checkpoint after each epoch and push checkpoint to the hub
                 if jax.process_index() == 0:
-                    # params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                    params = jax.device_get(unreplicate(state.params))
                     model.save_pretrained(
                         training_args.output_dir,
                         params=params,
                         push_to_hub=training_args.push_to_hub,
                         commit_message=f"Saving weights and logs of step {cur_step}",
                     )
 if __name__ == "__main__":

                 # Save metrics
                 train_metric = unreplicate(train_metric)
                 train_time += time.time() - train_start
                 if has_tensorboard and jax.process_index() == 0:
+                    logger.info(f"*** Writing training summary after {cur_step} steps ***")
                     write_train_metric(summary_writer, train_metrics, train_time, cur_step)
                 epochs.write(
                 train_metrics = []
             if cur_step % training_args.eval_steps == 0 and cur_step > 0 and training_args.do_eval:
+                logger.info(f"*** Evaluation after {cur_step} steps ***")
                 eval_metrics = []
                 eval_preds = []
                 eval_labels = []
                 # Save metrics
                 if has_tensorboard and jax.process_index() == 0:
+                    logger.info(f"*** Writing evaluation summary after {cur_step} steps ***")
+                    # cur_step = epoch * (len(train_dataset) // train_batch_size)
                     write_eval_metric(summary_writer, eval_metrics, cur_step)
             if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                logger.info(f"*** Saving checkpoints after {cur_step} steps ***")
+                # save a checkpoint every `save_steps` steps and push it to the hub
                 if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    # params = jax.device_get(unreplicate(state.params))
                     model.save_pretrained(
                         training_args.output_dir,
                         params=params,
                         push_to_hub=training_args.push_to_hub,
                         commit_message=f"Saving weights and logs of step {cur_step}",
                     )
+                    tokenizer.save_pretrained(
+                        training_args.output_dir,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving tokenizer step {cur_step}",
+                    )
 if __name__ == "__main__":