Spaces:
Sleeping
Sleeping
| # make_json.py (or build_videos_json.py) | |
| import json, os, re, random | |
| from typing import Optional | |
| from huggingface_hub import list_repo_files | |
def parse_include_folders(raw: str) -> list:
    """Split a comma-separated folder list into clean repo-relative prefixes.

    Whitespace and leading/trailing slashes are removed from every entry and
    empty entries are dropped. Slashes are stripped because the consumer
    appends "/" itself before prefix-matching; an entry such as "videos/"
    would otherwise become "videos//" and silently match nothing.

    Args:
        raw: Comma-separated folder names, e.g. "clips, extra/run1/".

    Returns:
        The cleaned folder names, in their original order.
    """
    folders = []
    for part in raw.split(","):
        folder = part.strip().strip("/").strip()
        if folder:
            folders.append(folder)
    return folders


# Hugging Face dataset repo to index; override with VIDEO_DATASET_REPO.
REPO = os.getenv("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")
# Optional comma-separated list of top-level folders to restrict the index to;
# an empty list means "include every folder".
INCLUDE_FOLDERS = parse_include_folders(os.getenv("VIDEO_INCLUDE_FOLDERS", ""))
# Output file, written relative to the current working directory.
OUT_PATH = "videos.json"
# Keep False if appending "?download=1" causes remote playback problems.
ADD_DOWNLOAD_PARAM = False
# Canonical names of the 10 supported actions (letter case is significant).
ALLOWED_ACTIONS = {
    "BodyWeightSquats", "HulaHoop", "JumpingJack", "PullUps", "PushUps",
    "Shotput", "SoccerJuggling", "TennisSwing", "ThrowDiscus", "WallPushups",
}

# Common spelling / letter-case / underscore variants mapped to the canonical
# name. Keys are lowercase; every action has at least one separator-free key.
ALIAS = {
    "bodyweightsquats": "BodyWeightSquats",
    "body_weight_squats": "BodyWeightSquats",
    "bodysquats": "BodyWeightSquats",
    "hulahoop": "HulaHoop",
    "jumpingjack": "JumpingJack",
    "pullups": "PullUps",
    "pushups": "PushUps",
    "shotput": "Shotput",
    "soccerjuggling": "SoccerJuggling",
    "soccer_juggling": "SoccerJuggling",
    "tennisswing": "TennisSwing",
    "tennis_swing": "TennisSwing",
    "throwdiscus": "ThrowDiscus",
    "throw_discus": "ThrowDiscus",
    "wallpushups": "WallPushups",
    "wall_pushups": "WallPushups",
}


def normalize_action(s: str) -> Optional[str]:
    """Map a raw action label to its canonical name.

    The label is lowercased and every run of non-alphanumeric characters is
    collapsed to a single underscore before it is looked up in ALIAS. If that
    exact key is unknown, the lookup is retried with the underscores removed,
    so separator variants without an explicit alias ("Hula_Hoop", "push-ups",
    "Jumping Jack") resolve the same way as the ones that have one.

    Args:
        s: Raw action label, typically a token taken from a file name.

    Returns:
        The canonical action name, or None when the label does not correspond
        to any entry of ALLOWED_ACTIONS.
    """
    key = re.sub(r"[^A-Za-z0-9]+", "_", s).lower().strip("_")
    canon = ALIAS.get(key)
    if canon is None:
        # Separator-insensitive fallback: "hula_hoop" -> "hulahoop".
        canon = ALIAS.get(key.replace("_", ""))
    return canon if canon in ALLOWED_ACTIONS else None
def is_index_token(tok: str) -> bool:
    """Return True when *tok* consists of exactly two or three ASCII digits.

    The character class is spelled out as [0-9] because \\d on a str pattern
    also matches non-ASCII decimal digits (e.g. Arabic-Indic), which are not
    valid clip indices.
    """
    return re.fullmatch(r"[0-9]{2,3}", tok) is not None
def extract_action_from_id(path_in_repo: str) -> Optional[str]:
    """Extract the canonical action name from a video path.

    Expected file-name layout:
        <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4

    The stem is split on "_" and scanned from right to left; the first
    2-3 digit token found is taken as <Index>. The tokens immediately before
    it are then offered to normalize_action, longest run first (up to three
    tokens), so that action names which themselves contain underscores
    ("Wall_Pushups", "Body_Weight_Squats") are recognised as a whole instead
    of being reduced to their last word.

    Args:
        path_in_repo: Repo-relative path of the video; only the final path
            component is inspected.

    Returns:
        The canonical action name, or None when no index token is found or
        the tokens before it do not normalize to a known action.
    """
    name = path_in_repo.rsplit("/", 1)[-1]
    stem = name.rsplit(".", 1)[0]
    toks = stem.split("_")
    max_action_tokens = 3  # the longest known name, Body_Weight_Squats, has 3 parts

    # Position 0 is never a usable index: nothing precedes it to name an action.
    for i in range(len(toks) - 1, 0, -1):
        if not is_index_token(toks[i]):
            continue
        # Longest candidate first, so "Wall_Pushups" wins over plain "Pushups".
        for width in range(min(max_action_tokens, i), 0, -1):
            action = normalize_action("_".join(toks[i - width:i]))
            if action is not None:
                return action
        # Only the right-most index token is considered.
        return None
    return None
def main():
    """Build OUT_PATH: a shuffled JSON list of the repo's recognised videos.

    Lists every file of the dataset repo REPO, keeps the ".mp4" files
    (optionally restricted to INCLUDE_FOLDERS), derives each video's action
    from its file name and writes one {"url", "id", "action"} record per
    recognised video. Files whose action cannot be determined are skipped
    and reported. A per-action count is printed at the end.
    """
    from collections import Counter
    from urllib.parse import quote

    base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
    files = list_repo_files(repo_id=REPO, repo_type="dataset")
    mp4s = [f for f in files if f.lower().endswith(".mp4")]
    if INCLUDE_FOLDERS:
        # str.startswith accepts a tuple, so one call checks every folder.
        prefixes = tuple(folder + "/" for folder in INCLUDE_FOLDERS)
        mp4s = [f for f in mp4s if f.startswith(prefixes)]

    suffix = "?download=1" if ADD_DOWNLOAD_PARAM else ""
    videos, bad = [], []
    for p in mp4s:
        action = extract_action_from_id(p)
        if action is None or action not in ALLOWED_ACTIONS:
            bad.append(p)
            continue
        # Percent-encode the path so names containing spaces, '#', '?' or '%'
        # still yield a valid URL; "/" is left as-is. "id" keeps the raw path.
        url = base_url + quote(p) + suffix
        videos.append({"url": url, "id": p, "action": action})

    # Randomise the order in which the videos appear in the output file.
    random.shuffle(videos)
    with open(OUT_PATH, "w", encoding="utf-8") as fp:
        json.dump(videos, fp, ensure_ascii=False, indent=2)
    print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")

    c = Counter(v["action"] for v in videos)
    for a in sorted(ALLOWED_ACTIONS):
        print(f" {a:16s} = {c.get(a,0)}")
    if bad:
        print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
        # Show only the first few skipped paths to keep the output short.
        for x in bad[:10]:
            print(" -", x)
# Build the index only when executed as a script, never on import.
if __name__ == "__main__":
    main()