# video_eval / make_my_repo_dataset.py
# Author: Youngsun Lim
# (Hugging Face file-viewer residue converted to comments: "first", commit f543cdc,
#  "raw / history blame", 10.3 kB)
#!/usr/bin/env python3
import os, re, csv, hashlib, shutil, subprocess
from pathlib import Path
from collections import defaultdict
from typing import List, Tuple
from huggingface_hub import HfApi, list_repo_files, hf_hub_download, create_repo
from huggingface_hub import whoami
from huggingface_hub import CommitOperationAdd
from huggingface_hub import snapshot_download
import time, glob
# ========= CONFIG =========
# Source HF dataset repo holding the generated UCF-101 videos.
SRC_REPO = "XThomasBU/video_evals_ucf101"
SRC_SUBDIRS = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"]  # only the folders we need
# Also include the local Wan2.2 set (60 clips)
WAN22_LOCAL_ROOT = Path("/projectnb/ivc-ml/youngsun/Video_Eval/Datasets/Wan2p2/Generated_UCF")
WAN22_TARGET_SUBDIR = "Wan2.2"  # top-level folder name in the destination repo (keep the name we used before)
DEST_REPO = "SGTLIM/ucf101_eval_unified"  # pinned to this account
DEST_REPO_IS_PRIVATE = False
# Working directory with plenty of space (/projectnb instead of /tmp)
WORKDIR = Path("/projectnb/ivc-ml/youngsun/tmp_ucf_unified")
STAGING = WORKDIR / "staging"    # staging root
DL_ROOT = WORKDIR / "downloads"  # where original downloads land
MAKE_SILENT = False  # if True, strip audio with ffmpeg instead of a plain copy
TOKEN = os.getenv("HF_TOKEN")  # must have write access to DEST_REPO
# =========================
# Use WORKDIR (instead of /tmp) for temp files and the HF cache
os.environ.setdefault("TMPDIR", str(WORKDIR / "_tmp"))
os.environ.setdefault("HF_HOME", "/projectnb/ivc-ml/youngsun/.cache/huggingface")
# Make sure the directories exist
(WORKDIR / "_tmp").mkdir(parents=True, exist_ok=True)
STAGING.mkdir(parents=True, exist_ok=True)
DL_ROOT.mkdir(parents=True, exist_ok=True)
# Action-name normalization (extend as needed)
ALIAS = {
    "bodyweightsquats": "body_weight_squats",
    "bodysquats": "body_weight_squats",
    "body_weight_squats": "body_weight_squats",
    "hulahoop": "hula_hoop",
    "jumpingjack": "jumping_jack",
    "pullups": "pull_ups",
    "pushups": "push_ups",
    "throwdiscus": "throw_discus",
    "wallpushups": "wall_pushups",
}
def camel_action(s: str) -> str:
    """Turn a snake_case action slug into CamelCase.

    Example: 'body_weight_squats' -> 'BodyWeightSquats'
    """
    pieces = [piece for piece in s.strip("_").split("_") if piece]
    return "".join(piece.capitalize() for piece in pieces)
def extract_action_from_remote(rel_remote: str) -> str:
    """Extract the action slug from the ORIGINAL path in the HF repo.

    Example: Hunyuan_videos/v_BodyWeightSquats_g05_c01.mp4 -> body_weight_squats
    """
    base = os.path.basename(rel_remote)
    hit = re.match(r"^v_([A-Za-z0-9]+)_", base)  # strictly v_<Action>_...
    if hit:
        return slugify_action(hit.group(1))
    # Fallback — rarely reached: the source repo follows the v_ pattern.
    return slugify_action(base.split("_", 1)[0])
def slugify_action(s: str) -> str:
    """Normalize an action name to a lowercase underscore slug, then apply ALIAS."""
    slug = s.strip().lower().replace(" ", "_")
    slug = re.sub(r"[^a-z0-9_]+", "_", slug)
    slug = re.sub(r"_+", "_", slug).strip("_")
    return ALIAS.get(slug, slug)
def model_slug(s: str) -> str:
    """Normalize a model name to a lowercase underscore slug.

    Spaces become underscores, any other non [a-z0-9_] run collapses to a
    single underscore, and leading/trailing underscores are stripped.
    """
    normalized = s.strip().lower().replace(" ", "_")
    normalized = re.sub(r"[^a-z0-9_]+", "_", normalized)
    return re.sub(r"_+", "_", normalized).strip("_")
def sha1_8(p: Path) -> str:
    """Return the first 8 hex digits of the SHA-1 of file `p`, read in 1 MiB chunks."""
    digest = hashlib.sha1()
    with open(p, "rb") as fh:
        while True:
            chunk = fh.read(1 << 20)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()[:8]
def ensure_ffmpeg():
    """Fail fast when silent output is requested but ffmpeg is missing.

    When MAKE_SILENT is False the check is skipped entirely.
    """
    if not MAKE_SILENT:
        return
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found, but MAKE_SILENT=True")
def mute_copy(src: Path, dst: Path):
    """Copy a video to `dst` with its audio track removed, via ffmpeg.

    First tries a fast stream copy (-c:v copy); on failure, falls back to a
    libx264 re-encode. (Currently unused — kept in case the silent-output
    option is revived.)
    """
    quiet = dict(stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    fast = ["ffmpeg", "-y", "-i", str(src), "-c:v", "copy", "-an", str(dst)]
    try:
        subprocess.run(fast, check=True, **quiet)
    except subprocess.CalledProcessError:
        reencode = [
            "ffmpeg", "-y", "-i", str(src),
            "-vf", "format=yuv420p", "-movflags", "+faststart",
            "-c:v", "libx264", "-crf", "18", "-preset", "veryfast",
            "-an", str(dst),
        ]
        subprocess.run(reencode, check=True, **quiet)
def extract_action_from_filename(fn: str) -> str:
    """Best-effort extraction of the action slug from a filename or path.

    Tries, in order: the leading 'v_<Action>' pattern, the parent directory
    name, and finally the stem with its first '<prefix>_' chunk removed.
    Example: v_BodyWeightSquats_g05_c01.mp4 -> body_weight_squats
    """
    hit = re.match(r"v_([A-Za-z0-9]+)", fn)
    if hit:
        return slugify_action(hit.group(1))
    segments = fn.split("/")
    if len(segments) >= 2:
        return slugify_action(segments[-2])
    trimmed = re.sub(r"^\w+_", "", Path(fn).stem)
    return slugify_action(trimmed)
def stage_from_hf_model(api, model_dir, rows):
    """Download every .mp4 under `model_dir` of SRC_REPO, rename each to
    '<Model>_<ActionCamel>_<NN>_<hash8>.mp4', move it into STAGING/<model_dir>,
    and append one mapping row per file to `rows`.

    Note: `api` is accepted but unused here; listing and downloading go
    through the module-level helpers directly.
    """
    # 1) From the source repo's file list, keep only this folder's mp4s
    all_remote = list_repo_files(repo_id=SRC_REPO, repo_type="dataset", token=TOKEN)
    remotes = [p for p in all_remote if p.startswith(model_dir + "/") and p.lower().endswith(".mp4")]
    print(f"[FETCH] {model_dir}: {len(remotes)} remote mp4 files")
    if not remotes:
        print(f"[WARN] no matches under {model_dir}")
        return
    # 2) Recreate a clean per-model download folder on every run
    dl_dir = DL_ROOT / model_dir
    if dl_dir.exists():
        shutil.rmtree(dl_dir)
    dl_dir.mkdir(parents=True, exist_ok=True)
    # 3) Keep (remote_path, local_path) pairs — the action is extracted from remote_path!
    pairs = []
    for rel_remote in remotes:
        local = hf_hub_download(
            repo_id=SRC_REPO,
            filename=rel_remote,  # keep the original remote path
            repo_type="dataset",
            token=TOKEN,
            local_dir=str(dl_dir),
            local_dir_use_symlinks=False,  # NOTE(review): deprecated in recent huggingface_hub — confirm version
        )
        pairs.append((rel_remote, local))
    # 4) Move into staging + rename to the canonical name
    folder_name = model_dir  # e.g. 'Hunyuan_videos', 'RunwayGen4', ...
    dst_dir = STAGING / folder_name
    dst_dir.mkdir(parents=True, exist_ok=True)
    counters = defaultdict(int)
    moved = 0
    for rel_remote, local in sorted(pairs):
        # The action is extracted ONLY from the original remote path
        action_slug = extract_action_from_remote(rel_remote)  # 'body_weight_squats'
        print(action_slug)
        action_camel = camel_action(action_slug)  # 'BodyWeightSquats'
        print(action_camel)
        counters[action_slug] += 1
        idx = counters[action_slug]
        h8 = sha1_8(Path(local))
        # Final naming rule: Model_Action_<2-digit idx>_<hash8>.mp4
        new_name = f"{folder_name}_{action_camel}_{idx:02d}_{h8}.mp4"
        dst = dst_dir / new_name
        shutil.move(local, dst)
        moved += 1
        rows.append([f"hf://{SRC_REPO}/{rel_remote}", folder_name, action_slug, idx, h8, f"{folder_name}/{new_name}"])
    print(f"[STAGED] {model_dir}: moved {moved} files to {dst_dir}")
def stage_from_local_wan22(rows: List[List[str]]):
    """Stage the local Wan2.2 clips into STAGING/<WAN22_TARGET_SUBDIR>.

    Walks WAN22_LOCAL_ROOT/<class_dir>/*.mp4, renames each clip to
    '<Model>_<ActionCamel>_<NN>_<hash8>.mp4', copies it (or strips audio via
    ffmpeg when MAKE_SILENT is True), and appends one mapping row per file.

    Fixes vs. the original: the initial 'Wan2.2' assignment to pretty_model
    was dead (immediately shadowed inside the inner loop), and the
    loop-invariant pretty_model / dst_dir / camel computations were redone
    for every file — they are now hoisted out of the loops.
    """
    pretty_model = WAN22_TARGET_SUBDIR  # e.g. 'Wan2.2' — destination top-level folder
    dst_dir = STAGING / pretty_model
    dst_dir.mkdir(parents=True, exist_ok=True)
    counters = defaultdict(int)
    for class_dir in sorted(p for p in WAN22_LOCAL_ROOT.iterdir() if p.is_dir()):
        action = slugify_action(class_dir.name)
        camel = camel_action(action)  # e.g. 'PushUps' — invariant per class dir
        for mp4 in sorted(p for p in class_dir.iterdir() if p.suffix.lower() == ".mp4"):
            counters[action] += 1
            idx = counters[action]
            h8 = sha1_8(mp4)
            new_name = f"{pretty_model}_{camel}_{idx:02d}_{h8}.mp4"
            dst = dst_dir / new_name
            if MAKE_SILENT:
                mute_copy(mp4, dst)  # audio stripped via ffmpeg
            else:
                shutil.copy2(mp4, dst)  # keep the original as-is (audio included)
            rows.append([str(mp4), pretty_model, action, idx, h8, f"{pretty_model}/{new_name}"])
def main():
    """Build the unified UCF-101 eval dataset and push it to DEST_REPO.

    Steps: reset the work dirs, stage the four HF model folders, add the
    local Wan2.2 clips, write mapping.csv, then push each staged folder as
    an explicit commit, followed by mapping.csv.
    """
    if not TOKEN:
        raise SystemExit("Set HF_TOKEN with write permission.")
    ensure_ffmpeg()
    # Reset the working directories so every run starts clean
    if STAGING.exists():
        shutil.rmtree(STAGING)
    if DL_ROOT.exists():
        shutil.rmtree(DL_ROOT)
    STAGING.mkdir(parents=True, exist_ok=True)
    DL_ROOT.mkdir(parents=True, exist_ok=True)
    api = HfApi()
    try:
        create_repo(repo_id=DEST_REPO, repo_type="dataset", private=DEST_REPO_IS_PRIVATE, token=TOKEN, exist_ok=True)
    except Exception:
        # NOTE(review): exist_ok=True already tolerates an existing repo;
        # this broad swallow also hides auth/network errors — confirm intent.
        pass
    rows: List[List[str]] = []
    # 1) Stage only the four model folders from the source repo
    for sub in SRC_SUBDIRS:
        stage_from_hf_model(api, sub, rows)
    # 2) Add the local Wan2.2 clips (60 files)
    stage_from_local_wan22(rows)
    # 3) Save the source -> destination mapping as CSV
    with open(STAGING / "mapping.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["source","model","action","idx","hash8","dest_path"])
        w.writerows(rows)
    # 4) Push each folder as one explicit commit (bypasses change detection)
    # NOTE(review): 'Wan2.2' is staged above but excluded from this push list
    # (see the trailing comment) — confirm whether it is pushed elsewhere.
    subdirs_to_push = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"] #, "Wan2.2"
    for sd in subdirs_to_push:
        sd_path = STAGING / sd
        if not sd_path.exists():
            print(f"[WARN] skip missing subdir: {sd}")
            continue
        files = sorted(str(p) for p in sd_path.rglob("*.mp4"))
        if not files:
            print(f"[WARN] skip empty subdir: {sd}")
            continue
        print(f"[PUSH] {sd} ... ({len(files)} files)")
        ops = []
        for fp in files:
            rel_in_repo = os.path.relpath(fp, start=STAGING)  # e.g. Hunyuan_videos/xxx.mp4
            ops.append(CommitOperationAdd(path_in_repo=rel_in_repo, path_or_fileobj=fp))
        api.create_commit(
            repo_id=DEST_REPO,
            repo_type="dataset",
            operations=ops,
            commit_message=f"Add {sd} ({len(files)} files)",
            token=TOKEN,
        )
    # Upload mapping.csv last
    map_csv = STAGING / "mapping.csv"
    if map_csv.exists():
        api.upload_file(
            path_or_fileobj=str(map_csv),
            path_in_repo="mapping.csv",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message="Add mapping.csv",
        )
    print(f"[DONE] Pushed to https://huggingface.co/datasets/{DEST_REPO}")


if __name__ == "__main__":
    main()