Spaces:

dghadiya
/

video_eval

Sleeping

video_eval / make_json.py

Youngsun Lim

first

f543cdc about 2 months ago

3.5 kB

	# make_json.py (or build_videos_json.py)

	import json, os, re, random
	from typing import Optional
	from huggingface_hub import list_repo_files

	# Dataset repo to scan; overridable through the VIDEO_DATASET_REPO env var.
	REPO = os.environ.get("VIDEO_DATASET_REPO", "SGTLIM/ucf101_eval_unified")
	# Optional comma-separated folder filter from VIDEO_INCLUDE_FOLDERS;
	# blank entries are dropped and surrounding whitespace is trimmed.
	INCLUDE_FOLDERS = [
	    part.strip()
	    for part in os.environ.get("VIDEO_INCLUDE_FOLDERS", "").split(",")
	    if part.strip()
	]
	# Path of the JSON file that gets written.
	OUT_PATH = "videos.json"
	# Whether "?download=1" is appended to each URL.
	# Keep this False if remote playback has problems.
	ADD_DOWNLOAD_PARAM = False

	# Canonical (case-sensitive) names of the 10 supported actions.
	ALLOWED_ACTIONS = {
	    "BodyWeightSquats",
	    "HulaHoop",
	    "JumpingJack",
	    "PullUps",
	    "PushUps",
	    "Shotput",
	    "SoccerJuggling",
	    "TennisSwing",
	    "ThrowDiscus",
	    "WallPushups",
	}

	# Common spelling / case / underscore variants mapped to the canonical name.
	# Keys are lowercase with words optionally separated by single underscores.
	# Every multi-word action lists both its joined and its underscore-separated
	# spelling so that all actions are treated alike.
	ALIAS = {
	    "bodyweightsquats": "BodyWeightSquats",
	    "body_weight_squats": "BodyWeightSquats",
	    "bodysquats": "BodyWeightSquats",
	    "body_squats": "BodyWeightSquats",

	    "hulahoop": "HulaHoop",
	    "hula_hoop": "HulaHoop",

	    "jumpingjack": "JumpingJack",
	    "jumping_jack": "JumpingJack",

	    "pullups": "PullUps",
	    "pull_ups": "PullUps",

	    "pushups": "PushUps",
	    "push_ups": "PushUps",

	    "shotput": "Shotput",
	    "shot_put": "Shotput",

	    "soccerjuggling": "SoccerJuggling",
	    "soccer_juggling": "SoccerJuggling",

	    "tennisswing": "TennisSwing",
	    "tennis_swing": "TennisSwing",

	    "throwdiscus": "ThrowDiscus",
	    "throw_discus": "ThrowDiscus",

	    "wallpushups": "WallPushups",
	    "wall_pushups": "WallPushups",
	    "wall_push_ups": "WallPushups",
	}

	def normalize_action(s: str) -> Optional[str]:
	    """Map a raw action spelling to its canonical name.

	    Each run of non-alphanumeric characters in *s* is collapsed to a single
	    underscore, the result is lowercased and stripped of leading/trailing
	    underscores, and that key is looked up in ALIAS.

	    Returns the canonical name when the lookup yields a member of
	    ALLOWED_ACTIONS, otherwise None.
	    """
	    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", s)
	    lookup_key = collapsed.lower().strip("_")
	    candidate = ALIAS.get(lookup_key)
	    if candidate in ALLOWED_ACTIONS:
	        return candidate
	    return None

	def is_index_token(tok: str) -> bool:
	    """Return True if *tok* consists of exactly two or three ASCII digits.

	    re.ASCII is required because, for str patterns, a bare ``\\d`` also
	    matches non-ASCII Unicode decimal digits, which are not valid indices.
	    """
	    return re.fullmatch(r"\d{2,3}", tok, flags=re.ASCII) is not None

	def extract_action_from_id(path_in_repo: str) -> Optional[str]:
	    """Extract the canonical action name from a video path.

	    Expected file-name layout:
	        <Model>_(...optional tokens...)_<Action>_<Index>_(hash).mp4

	    The file stem is split on "_" and scanned from right to left. The first
	    token accepted by is_index_token is treated as <Index>. The tokens to its
	    left are then offered to normalize_action, longest run first, so that a
	    multi-token spelling such as "wall_pushups" is recognised as a whole
	    instead of being cut down to its last token ("pushups").

	    Returns the canonical action name, or None when there is no index token
	    or no run of tokens before it normalizes to an allowed action.
	    """
	    name = path_in_repo.rsplit("/", 1)[-1]
	    stem = name.rsplit(".", 1)[0]
	    toks = stem.split("_")
	    # Scan right to left for the index token (e.g. 01, 10, 123).
	    for i in reversed(range(len(toks))):
	        if not is_index_token(toks[i]):
	            continue
	        # Longest candidate first; the last candidate is the single token
	        # directly before the index.
	        for start in range(i):
	            action = normalize_action("_".join(toks[start:i]))
	            if action is not None:
	                return action
	        # Only the right-most index token is considered.
	        return None
	    return None


	def main():
	    """Write OUT_PATH with one entry per recognised .mp4 file in REPO.

	    Lists the files of the dataset repo, keeps the .mp4 files (optionally
	    restricted to INCLUDE_FOLDERS), derives each file's action with
	    extract_action_from_id, and writes the shuffled list of
	    {"url", "id", "action"} dicts as JSON. A per-action count and up to ten
	    skipped paths are printed afterwards.
	    """
	    from collections import Counter
	    from urllib.parse import quote

	    base_url = f"https://huggingface.co/datasets/{REPO}/resolve/main/"
	    files = list_repo_files(repo_id=REPO, repo_type="dataset")
	    mp4s = [f for f in files if f.lower().endswith(".mp4")]
	    if INCLUDE_FOLDERS:
	        # Tolerate entries given with a trailing slash ("folder/"); without
	        # this the prefix would be "folder//" and never match.
	        prefixes = tuple(folder.rstrip("/") + "/" for folder in INCLUDE_FOLDERS)
	        mp4s = [f for f in mp4s if f.startswith(prefixes)]

	    suffix = "?download=1" if ADD_DOWNLOAD_PARAM else ""
	    videos, bad = [], []
	    for p in mp4s:
	        action = extract_action_from_id(p)
	        if action is None or action not in ALLOWED_ACTIONS:
	            bad.append(p)
	            continue
	        # Percent-encode the path so names containing spaces, "#", "?" etc.
	        # still form a valid URL; "/" is left untouched by quote().
	        url = base_url + quote(p) + suffix
	        videos.append({"url": url, "id": p, "action": action})

	    random.shuffle(videos)
	    with open(OUT_PATH, "w", encoding="utf-8") as fp:
	        json.dump(videos, fp, ensure_ascii=False, indent=2)

	    print(f"[DONE] wrote {OUT_PATH} with {len(videos)} items from repo={REPO}")
	    counts = Counter(v["action"] for v in videos)
	    for a in sorted(ALLOWED_ACTIONS):
	        print(f" {a:16s} = {counts.get(a, 0)}")
	    if bad:
	        print(f"[INFO] skipped (not matched to ALLOWED after normalize): {len(bad)}")
	        # Show only the first ten skipped paths.
	        for x in bad[:10]:
	            print(" -", x)


	if __name__ == "__main__":
	    main()