# make_json.py (or build_videos_json.py)
"""Build ``videos.json`` from the .mp4 files of a Hugging Face dataset repo.

Every .mp4 path in the repo is mapped to one of the canonical action names
in ``ALLOWED_ACTIONS`` by parsing its file name; files whose action cannot
be resolved are skipped and reported on stdout.

Environment variables:
    VIDEO_DATASET_REPO: dataset repo id to list
        (default ``SGTLIM/ucf101_eval_unified``).
    VIDEO_INCLUDE_FOLDERS: optional comma-separated list of top-level
        folders; when set, only files under those folders are kept.
"""
import json
import os
import random
import re
from collections import Counter
from typing import Optional
from urllib.parse import quote

try:
    from huggingface_hub import list_repo_files
except ImportError:
    # Keep the pure parsing helpers importable where huggingface_hub is not
    # installed; main() raises ImportError when it actually needs the client.
    list_repo_files = None

REPO = os.getenv("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")
# Surrounding slashes are stripped so that "folder/" behaves like "folder";
# entries that end up empty are dropped.
INCLUDE_FOLDERS = [
    folder
    for folder in (
        raw.strip().strip("/")
        for raw in os.getenv("VIDEO_INCLUDE_FOLDERS", "").split(",")
    )
    if folder
]
OUT_PATH = "videos.json"
ADD_DOWNLOAD_PARAM = False  # keep False if remote playback has problems

# Canonical (case-sensitive) names of the 10 actions.
ALLOWED_ACTIONS = {
    "BodyWeightSquats", "HulaHoop", "JumpingJack", "PullUps", "PushUps",
    "Shotput", "SoccerJuggling", "TennisSwing", "ThrowDiscus", "WallPushups",
}

# Common spelling / case / underscore variants -> canonical name.
ALIAS = {
    "bodyweightsquats": "BodyWeightSquats",
    "body_weight_squats": "BodyWeightSquats",
    "bodysquats": "BodyWeightSquats",
    "hulahoop": "HulaHoop",
    "jumpingjack": "JumpingJack",
    "pullups": "PullUps",
    "pushups": "PushUps",
    "shotput": "Shotput",
    "soccerjuggling": "SoccerJuggling",
    "soccer_juggling": "SoccerJuggling",
    "tennisswing": "TennisSwing",
    "tennis_swing": "TennisSwing",
    "throwdiscus": "ThrowDiscus",
    "throw_discus": "ThrowDiscus",
    "wallpushups": "WallPushups",
    "wall_pushups": "WallPushups",
}

_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]+")
_INDEX_TOKEN_RE = re.compile(r"\d{2,3}")


def normalize_action(s: str) -> Optional[str]:
    """Map a raw action spelling to its canonical name.

    Runs of non-alphanumeric characters are collapsed to ``_`` and the
    result is lower-cased before the ``ALIAS`` lookup. If the exact key is
    unknown, a separator-free form is tried as well, so spellings such as
    ``Hula-Hoop`` or ``pull_ups`` resolve even though ``ALIAS`` only lists
    ``hulahoop`` / ``pullups``.

    Args:
        s: Raw action text.

    Returns:
        The canonical action name, or ``None`` when the text does not
        resolve to a member of ``ALLOWED_ACTIONS``.
    """
    key = _NON_ALNUM_RE.sub("_", s).lower().strip("_")
    # Exact alias first (unchanged behaviour), then the separator-free form.
    canon = ALIAS.get(key) or ALIAS.get(key.replace("_", ""))
    return canon if canon in ALLOWED_ACTIONS else None


def is_index_token(tok: str) -> bool:
    """Return True when *tok* consists of exactly two or three digits."""
    return _INDEX_TOKEN_RE.fullmatch(tok) is not None


def extract_action_from_id(path_in_repo: str) -> Optional[str]:
    """Extract the canonical action name from a repo file path.

    The file name (directory part and extension removed) is split on ``_``.
    Tokens are scanned from right to left; the first 2-3 digit token is
    taken as the clip index and the single token immediately before it is
    taken as the action.
    NOTE(review): this presumably targets names shaped like
    ``..._<Action>_<Index>_<hash>.mp4`` -- confirm against the dataset.

    Args:
        path_in_repo: File path inside the dataset repo.

    Returns:
        The canonical action name, or ``None`` when no index token with a
        preceding token exists or that token is not a known action.
    """
    name = path_in_repo.rsplit("/", 1)[-1]
    stem = name.rsplit(".", 1)[0]
    toks = stem.split("_")

    # Stop at position 1: an index token at position 0 has nothing before it.
    for i in range(len(toks) - 1, 0, -1):
        if is_index_token(toks[i]):
            return normalize_action(toks[i - 1])
    return None


def build_video_url(path_in_repo: str) -> str:
    """Return the ``resolve/main`` URL for a file in ``REPO``.

    The path is percent-encoded (slashes are kept) so that file names
    containing spaces or other reserved characters still yield a valid URL.
    ``?download=1`` is appended when ``ADD_DOWNLOAD_PARAM`` is true.
    """
    url = f"https://huggingface.co/datasets/{REPO}/resolve/main/{quote(path_in_repo)}"
    if ADD_DOWNLOAD_PARAM:
        url += "?download=1"
    return url


def main():
    """List the repo's .mp4 files, label them, and write ``OUT_PATH``.

    Writes a shuffled JSON list of ``{"url", "id", "action"}`` records and
    prints a per-action count plus up to 10 skipped paths.

    Raises:
        ImportError: If ``huggingface_hub`` is not installed.
    """
    if list_repo_files is None:
        raise ImportError(
            "huggingface_hub is required to list the dataset repository files"
        )

    files = list_repo_files(repo_id=REPO, repo_type="dataset")
    mp4s = [f for f in files if f.lower().endswith(".mp4")]
    if INCLUDE_FOLDERS:
        prefixes = tuple(folder + "/" for folder in INCLUDE_FOLDERS)
        mp4s = [f for f in mp4s if f.startswith(prefixes)]

    videos, bad = [], []
    for p in mp4s:
        # extract_action_from_id only ever returns None or an allowed action.
        action = extract_action_from_id(p)
        if action is None:
            bad.append(p)
            continue
        videos.append({"url": build_video_url(p), "id": p, "action": action})

    random.shuffle(videos)
    with open(OUT_PATH, "w", encoding="utf-8") as fp:
        json.dump(videos, fp, ensure_ascii=False, indent=2)

    print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")
    c = Counter(v["action"] for v in videos)
    for a in sorted(ALLOWED_ACTIONS):
        print(f" {a:16s} = {c.get(a,0)}")
    if bad:
        print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
        for x in bad[:10]:
            print(" -", x)


if __name__ == "__main__":
    main()