File size: 3,498 Bytes
f543cdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# make_json.py (or build_videos_json.py)

import json, os, re, random
from typing import Optional
from huggingface_hub import list_repo_files

# Hugging Face dataset repository to index; override with VIDEO_DATASET_REPO.
REPO = os.getenv("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")

# Optional comma-separated folder whitelist taken from VIDEO_INCLUDE_FOLDERS.
# Entries are stripped and blank ones dropped, so an unset or empty variable
# yields an empty list.
INCLUDE_FOLDERS = list(
    filter(None, map(str.strip, os.getenv("VIDEO_INCLUDE_FOLDERS", "").split(",")))
)

# Path the generated manifest is written to.
OUT_PATH = "videos.json"

# When True, "?download=1" is appended to every URL.
# Keep this False if remote playback has problems.
ADD_DOWNLOAD_PARAM = False

# Canonical names (exact casing) of the 10 supported actions.
ALLOWED_ACTIONS = {
    "BodyWeightSquats",
    "HulaHoop",
    "JumpingJack",
    "PullUps",
    "PushUps",
    "Shotput",
    "SoccerJuggling",
    "TennisSwing",
    "ThrowDiscus",
    "WallPushups",
}

# Lookup from common spelling / casing / underscore variants (keys are already
# lower-cased) to the canonical action name they stand for.
ALIAS = {
    variant: canonical
    for canonical, variants in (
        ("BodyWeightSquats", ("bodyweightsquats", "body_weight_squats", "bodysquats")),
        ("HulaHoop", ("hulahoop",)),
        ("JumpingJack", ("jumpingjack",)),
        ("PullUps", ("pullups",)),
        ("PushUps", ("pushups",)),
        ("Shotput", ("shotput",)),
        ("SoccerJuggling", ("soccerjuggling", "soccer_juggling")),
        ("TennisSwing", ("tennisswing", "tennis_swing")),
        ("ThrowDiscus", ("throwdiscus", "throw_discus")),
        ("WallPushups", ("wallpushups", "wall_pushups")),
    )
    for variant in variants
}

def normalize_action(s: str) -> Optional[str]:
    """Map a raw action label to its canonical name in ALLOWED_ACTIONS.

    The label is lower-cased and every run of non-alphanumeric characters is
    collapsed into a single underscore before it is looked up in ALIAS. If
    that exact spelling is not listed, the lookup is retried with the
    underscores removed, so separator variants such as "pull_ups" or
    "Jumping-Jack" resolve through the compact alias ("pullups",
    "jumpingjack") without each one having to be listed explicitly.

    Args:
        s: Raw action text, e.g. a token taken from a file name.

    Returns:
        The canonical action name, or None when the label is not a string,
        is empty after cleaning, or does not resolve to a member of
        ALLOWED_ACTIONS.
    """
    if not isinstance(s, str):
        return None
    key = re.sub(r"[^A-Za-z0-9]+", "_", s).lower().strip("_")
    if not key:
        return None
    # The exact spelling wins; the compact form is only a fallback, so every
    # label that resolved before still resolves to the same name.
    canon = ALIAS.get(key)
    if canon is None:
        canon = ALIAS.get(key.replace("_", ""))
    return canon if canon in ALLOWED_ACTIONS else None

def is_index_token(tok: str) -> bool:
    """Report whether *tok* consists of exactly two or three digits."""
    matched = re.fullmatch(r"\d{2,3}", tok)
    return matched is not None

def extract_action_from_id(path_in_repo: str) -> Optional[str]:
    """Derive the canonical action name from a video path in the repo.

    Expected file-name layout:
        <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4

    The stem is split on "_" and scanned from the right; the first numeric
    token of 2-3 digits is taken as <Index>. The tokens immediately before
    it are then tried as the action, longest run first (up to three tokens),
    so names that are themselves written with underscores
    ("Body_Weight_Squats_01") are recognised and "Wall_Pushups_01" is not
    mistaken for plain "Pushups".

    Args:
        path_in_repo: Repo-relative path; only its final component is read.

    Returns:
        The canonical action name, or None when no index token is found or
        the tokens in front of it do not normalize to a known action.
    """
    file_name = path_in_repo.rsplit("/", 1)[-1]
    stem = file_name.rsplit(".", 1)[0]
    toks = stem.split("_")

    # Right-to-left search for the index token (e.g. 01, 10, 123). Position 0
    # is skipped: an index there has no action token in front of it.
    index_pos = next(
        (i for i in range(len(toks) - 1, 0, -1) if is_index_token(toks[i])),
        None,
    )
    if index_pos is None:
        return None

    # The longest canonical name (BodyWeightSquats) is three words.
    max_action_tokens = 3
    for width in range(min(max_action_tokens, index_pos), 0, -1):
        candidate = "_".join(toks[index_pos - width:index_pos])
        action = normalize_action(candidate)
        if action is not None:
            return action
    return None


def main():
    """Write OUT_PATH: a shuffled JSON manifest of the repo's action videos.

    Lists every file in the REPO dataset, keeps the ``.mp4`` files (only
    those under INCLUDE_FOLDERS when that list is non-empty), derives each
    clip's action from its file name, and writes the recognised clips to
    OUT_PATH as ``{"url", "id", "action"}`` records in random order. Clips
    whose action cannot be resolved are skipped. A per-action count and up
    to ten of the skipped paths are printed afterwards.
    """
    from collections import Counter
    from urllib.parse import quote

    base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
    files = list_repo_files(repo_id=REPO, repo_type="dataset")
    mp4s = [f for f in files if f.lower().endswith(".mp4")]
    if INCLUDE_FOLDERS:
        # Tolerate entries written with a trailing slash ("clips/"), which
        # would otherwise become the never-matching prefix "clips//".
        prefixes = tuple(folder.rstrip("/") + "/" for folder in INCLUDE_FOLDERS)
        mp4s = [f for f in mp4s if f.startswith(prefixes)]

    suffix = "?download=1" if ADD_DOWNLOAD_PARAM else ""
    videos, bad = [], []
    for p in mp4s:
        action = extract_action_from_id(p)
        if action not in ALLOWED_ACTIONS:  # also true when action is None
            bad.append(p)
            continue
        # Percent-encode the path so spaces, '#', '?' or non-ASCII characters
        # in a file name cannot break the URL; "/" separators are kept.
        url = base_url + quote(p, safe="/") + suffix
        videos.append({"url": url, "id": p, "action": action})

    random.shuffle(videos)
    with open(OUT_PATH, "w", encoding="utf-8") as fp:
        json.dump(videos, fp, ensure_ascii=False, indent=2)

    print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")
    c = Counter(v["action"] for v in videos)
    for a in sorted(ALLOWED_ACTIONS):
        print(f"  {a:16s} = {c.get(a,0)}")
    if bad:
        print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
        shown = bad[:10]
        for x in shown:
            print("   -", x)
        if len(bad) > len(shown):
            print(f"   ... and {len(bad) - len(shown)} more")

# Build the manifest only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()