Fix data preparation
Browse files
- src/preparaing_recipe_nlg_dataset.py +8 -3
- src/run.sh +2 -1
src/preparaing_recipe_nlg_dataset.py
CHANGED
|
@@ -81,6 +81,7 @@ def main():
|
|
| 81 |
cache_dir=data_args.cache_dir
|
| 82 |
)
|
| 83 |
|
|
|
|
| 84 |
def cleaning(text, item_type="ner"):
|
| 85 |
# NOTE: DO THE CLEANING LATER
|
| 86 |
text = normalizer(text, do_lowercase=True)
|
|
@@ -92,9 +93,9 @@ def main():
|
|
| 92 |
ingredients = item_dict["ingredients"]
|
| 93 |
steps = item_dict["directions"]
|
| 94 |
|
| 95 |
-
condition_1 = filter_by_item(ner,
|
| 96 |
-
condition_2 = filter_by_length(title,
|
| 97 |
-
condition_3 = filter_by_item(ingredients,
|
| 98 |
condition_4 = filter_by_item(steps, 2)
|
| 99 |
condition_5 = filter_by_steps(" ".join(steps))
|
| 100 |
|
|
@@ -140,6 +141,10 @@ def main():
|
|
| 140 |
data_dict.append(item)
|
| 141 |
|
| 142 |
data_df = pd.DataFrame(data_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
train, test = train_test_split(data_df, test_size=0.05, random_state=101)
|
| 144 |
|
| 145 |
train = train.reset_index(drop=True)
|
|
|
|
| 81 |
cache_dir=data_args.cache_dir
|
| 82 |
)
|
| 83 |
|
| 84 |
+
|
| 85 |
def cleaning(text, item_type="ner"):
|
| 86 |
# NOTE: DO THE CLEANING LATER
|
| 87 |
text = normalizer(text, do_lowercase=True)
|
|
|
|
| 93 |
ingredients = item_dict["ingredients"]
|
| 94 |
steps = item_dict["directions"]
|
| 95 |
|
| 96 |
+
condition_1 = filter_by_item(ner, 3)
|
| 97 |
+
condition_2 = filter_by_length(title, 3)
|
| 98 |
+
condition_3 = filter_by_item(ingredients, 3)
|
| 99 |
condition_4 = filter_by_item(steps, 2)
|
| 100 |
condition_5 = filter_by_steps(" ".join(steps))
|
| 101 |
|
|
|
|
| 141 |
data_dict.append(item)
|
| 142 |
|
| 143 |
data_df = pd.DataFrame(data_dict)
|
| 144 |
+
|
| 145 |
+
logger.info(f"Preparation - [before] consists of {len(dataset[subset])} records!")
|
| 146 |
+
logger.info(f"Preparation - [after] consists of {len(data_df)} records!")
|
| 147 |
+
|
| 148 |
train, test = train_test_split(data_df, test_size=0.05, random_state=101)
|
| 149 |
|
| 150 |
train = train.reset_index(drop=True)
|
src/run.sh
CHANGED
|
@@ -52,4 +52,5 @@ python run_ed_recipe_nlg.py \
|
|
| 52 |
--do_train \
|
| 53 |
--do_eval \
|
| 54 |
--overwrite_output_dir \
|
| 55 |
-
--predict_with_generate
|
|
|
|
|
|
| 52 |
--do_train \
|
| 53 |
--do_eval \
|
| 54 |
--overwrite_output_dir \
|
| 55 |
+
--predict_with_generate \
|
| 56 |
+
--push_to_hub
|