Spaces:
Sleeping
Sleeping
File size: 3,498 Bytes
f543cdc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# make_json.py (or build_videos_json.py)
import json, os, re, random
from typing import Optional
from huggingface_hub import list_repo_files
# Dataset repo to index; override with the VIDEO_DATASET_REPO environment variable.
REPO = os.environ.get("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")

# Optional comma-separated folder names from VIDEO_INCLUDE_FOLDERS. Entries are
# whitespace-stripped and blank ones dropped; an empty list disables the
# folder filter in main().
INCLUDE_FOLDERS = list(
    filter(None, map(str.strip, os.environ.get("VIDEO_INCLUDE_FOLDERS", "").split(",")))
)

# File the generated JSON list is written to.
OUT_PATH = "videos.json"

# When True, "?download=1" is appended to every video URL.
# Keep False unless remote playback has problems.
ADD_DOWNLOAD_PARAM = False
# Canonical names of the 10 actions (exact letter case matters)
# A video is kept only when its normalized action is one of these names.
ALLOWED_ACTIONS = {
    "BodyWeightSquats",
    "HulaHoop",
    "JumpingJack",
    "PullUps",
    "PushUps",
    "Shotput",
    "SoccerJuggling",
    "TennisSwing",
    "ThrowDiscus",
    "WallPushups",
}
# Common spelling / letter-case / underscore variants -> mapped to the canonical name
# Lookup table from a lower-cased spelling variant to its canonical action
# name, declared per canonical name so each action's variants sit together.
ALIAS = {
    variant: canonical
    for canonical, variants in (
        ("BodyWeightSquats", ("bodyweightsquats", "body_weight_squats", "bodysquats")),
        ("HulaHoop", ("hulahoop",)),
        ("JumpingJack", ("jumpingjack",)),
        ("PullUps", ("pullups",)),
        ("PushUps", ("pushups",)),
        ("Shotput", ("shotput",)),
        ("SoccerJuggling", ("soccerjuggling", "soccer_juggling")),
        ("TennisSwing", ("tennisswing", "tennis_swing")),
        ("ThrowDiscus", ("throwdiscus", "throw_discus")),
        ("WallPushups", ("wallpushups", "wall_pushups")),
    )
    for variant in variants
}
def normalize_action(s: str) -> Optional[str]:
    """Map a raw action label to its canonical name.

    The label is reduced to a lookup key by replacing every run of
    non-alphanumeric characters with a single "_", lower-casing, and trimming
    leading/trailing underscores. The key is looked up in ALIAS; if it is not
    found, the key with all underscores removed is tried as well, so labels
    that differ only by a separator ("Pull-Ups", "push_ups", "Hula Hoop")
    resolve without needing one alias entry per separator variant.

    Args:
        s: Raw action label, e.g. a token taken from a file name.

    Returns:
        The canonical action name if it resolves to a member of
        ALLOWED_ACTIONS, otherwise None.
    """
    key = re.sub(r"[^A-Za-z0-9]+", "_", s).lower().strip("_")
    canon = ALIAS.get(key)
    if canon is None:
        # Fallback: ignore separators entirely ("pull_ups" -> "pullups").
        canon = ALIAS.get(key.replace("_", ""))
    return canon if canon in ALLOWED_ACTIONS else None
def is_index_token(tok: str) -> bool:
    """Return True when *tok* consists of exactly two or three digits (e.g. "01", "123")."""
    return re.fullmatch(r"\d{2,3}", tok) is not None
def extract_action_from_id(path_in_repo: str) -> Optional[str]:
    """Extract the canonical action name from a video path.

    Expected file-name layout::

        <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4

    The file stem is split on "_" and scanned from the right. The first token
    that is a 2-3 digit number is treated as the index, and the token(s)
    immediately before it as the action. The single preceding token is tried
    first; if it does not normalize to a known action, the preceding two and
    then three tokens are joined with "_" and tried, so actions written with
    underscores (e.g. ``Soccer_Juggling_01``) are recognized as well.

    Args:
        path_in_repo: Repo-relative path (or bare file name) of the video.

    Returns:
        The canonical action name, or None when no index token is found,
        nothing precedes it, or the preceding tokens do not normalize to an
        allowed action.
    """
    file_name = path_in_repo.rsplit("/", 1)[-1]
    stem = file_name.rsplit(".", 1)[0]
    toks = stem.split("_")

    # The longest alias key ("body_weight_squats") spans three tokens.
    max_action_tokens = 3

    # Scan right-to-left for the index token (e.g. 01, 10, 123).
    for i in range(len(toks) - 1, -1, -1):
        if not is_index_token(toks[i]):
            continue
        # Shortest candidate first, so any name that resolved from a single
        # token keeps resolving to exactly the same action.
        for width in range(1, min(max_action_tokens, i) + 1):
            action = normalize_action("_".join(toks[i - width:i]))
            if action is not None:
                return action
        # Only the right-most index token is considered.
        break
    return None
def main():
    """Generate OUT_PATH, a shuffled JSON list of the repo's usable .mp4 videos.

    Steps:
      1. List every file of the dataset repo REPO and keep the ``.mp4`` ones
         (case-insensitive extension match).
      2. If INCLUDE_FOLDERS is non-empty, keep only files located under one of
         those folders. A folder may be given with or without a trailing "/".
      3. Derive each video's action from its file name; files whose action
         does not resolve to a member of ALLOWED_ACTIONS are skipped.
      4. Shuffle the entries, write them as JSON, then print a per-action
         count and up to ten of the skipped paths.

    Each written entry has the keys ``url``, ``id`` (the repo-relative path)
    and ``action``.
    """
    base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
    files = list_repo_files(repo_id=REPO, repo_type="dataset")
    mp4s = [f for f in files if f.lower().endswith(".mp4")]

    if INCLUDE_FOLDERS:
        # Normalize so "folder" and "folder/" select the same subtree; without
        # this, a trailing slash produced the prefix "folder//" and silently
        # filtered out every file.
        prefixes = tuple(folder.rstrip("/") + "/" for folder in INCLUDE_FOLDERS)
        mp4s = [f for f in mp4s if f.startswith(prefixes)]

    # Loop-invariant query suffix.
    suffix = "?download=1" if ADD_DOWNLOAD_PARAM else ""

    videos, bad = [], []
    for p in mp4s:
        action = extract_action_from_id(p)
        # None is never a member of ALLOWED_ACTIONS, so this also covers the
        # "no action found" case.
        if action not in ALLOWED_ACTIONS:
            bad.append(p)
            continue
        videos.append({"url": base_url + p + suffix, "id": p, "action": action})

    random.shuffle(videos)
    with open(OUT_PATH, "w", encoding="utf-8") as fp:
        json.dump(videos, fp, ensure_ascii=False, indent=2)
    print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")

    from collections import Counter

    c = Counter(v["action"] for v in videos)
    for a in sorted(ALLOWED_ACTIONS):
        print(f" {a:16s} = {c.get(a, 0)}")

    if bad:
        print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
        for x in bad[:10]:
            print(" -", x)
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|