#!/usr/bin/env python3
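"""Consolidate generated UCF-101 videos into one Hugging Face dataset repo.

Downloads four model folders from SRC_REPO, adds the local Wan2.2 videos,
renames every file to <Model>_<Action>_<index>_<hash8>.mp4 under STAGING,
writes mapping.csv, and pushes the result to DEST_REPO.
"""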
import os, re, csv, hashlib, shutil, subprocess
from pathlib import Path
from collections import defaultdict
from typing import List
from huggingface_hub import (
    HfApi,
    CommitOperationAdd,
    create_repo,
    hf_hub_download,
    list_repo_files,
)

# ========= CONFIG =========
SRC_REPO = "XThomasBU/video_evals_ucf101"
SRC_SUBDIRS = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"]  # only the folders we need

# Also include the local Wan2.2 set (60 videos)
WAN22_LOCAL_ROOT = Path("/projectnb/ivc-ml/youngsun/Video_Eval/Datasets/Wan2p2/Generated_UCF")
WAN22_TARGET_SUBDIR = "Wan2.2"  # top-level folder name in the destination repo (keep the name we already use)

DEST_REPO = "SGTLIM/ucf101_eval_unified"  # <- pinned to your account
DEST_REPO_IS_PRIVATE = False

# Working directory with plenty of space (/projectnb instead of /tmp)
WORKDIR = Path("/projectnb/ivc-ml/youngsun/tmp_ucf_unified")
STAGING = WORKDIR / "staging"    # staging root
DL_ROOT = WORKDIR / "downloads"  # where the source files are downloaded

MAKE_SILENT = False
TOKEN = os.getenv("HF_TOKEN")
# =========================

# Use WORKDIR for temp files and the HF cache instead of /tmp
os.environ.setdefault("TMPDIR", str(WORKDIR / "_tmp"))
os.environ.setdefault("HF_HOME", "/projectnb/ivc-ml/youngsun/.cache/huggingface")

# Make sure the directories exist
(WORKDIR / "_tmp").mkdir(parents=True, exist_ok=True)
STAGING.mkdir(parents=True, exist_ok=True)
DL_ROOT.mkdir(parents=True, exist_ok=True)

# Action-name normalization (add entries here as needed)
ALIAS = {
    "bodyweightsquats": "body_weight_squats",
    "bodysquats": "body_weight_squats",
    "body_weight_squats": "body_weight_squats",
    "hulahoop": "hula_hoop",
    "jumpingjack": "jumping_jack",
    "pullups": "pull_ups",
    "pushups": "push_ups",
    "throwdiscus": "throw_discus",
    "wallpushups": "wall_pushups",
}

def camel_action(s: str) -> str:
    # 'body_weight_squats' -> 'BodyWeightSquats'
    parts = s.strip("_").split("_")
    return "".join(p.capitalize() for p in parts if p)

def extract_action_from_remote(rel_remote: str) -> str:
    """
    Extract the action from the original path in the HF repo only.
    e.g. Hunyuan_videos/v_BodyWeightSquats_g05_c01.mp4 -> body_weight_squats
    """
    base = os.path.basename(rel_remote)
    m = re.match(r"^v_([A-Za-z0-9]+)_", base)  # expects v_<Action>_...
    if m:
        return slugify_action(m.group(1))
    # Fallback; the source repo follows the v_ pattern, so we should rarely get here.
    return slugify_action(base.split("_", 1)[0])
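
# Normalize a raw action string to a lowercase snake_case slug and apply the ALIAS table.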
def slugify_action(s: str) -> str:
    s = s.strip().lower().replace(" ", "_")
    s = re.sub(r"[^a-z0-9_]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return ALIAS.get(s, s)
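
# Slugify a model name the same way (helper; not called elsewhere in this script).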
def model_slug(s: str) -> str:
    s = s.strip().lower()
    s = s.replace(" ", "_")
    s = re.sub(r"[^a-z0-9_]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s
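
# Short content hash: first 8 hex chars of the file's SHA-1, used to keep renamed files unique.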
def sha1_8(p: Path) -> str:
    h = hashlib.sha1()
    with open(p, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()[:8]

def ensure_ffmpeg():
    # The ffmpeg check only matters when MAKE_SILENT=True; skipped otherwise
    if MAKE_SILENT and not shutil.which("ffmpeg"):
        raise RuntimeError("ffmpeg not found, but MAKE_SILENT=True")

def mute_copy(src: Path, dst: Path):
    # (Not used right now; kept in case the silent-copy option is turned back on)
    cmd = ["ffmpeg", "-y", "-i", str(src), "-c:v", "copy", "-an", str(dst)]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # Stream copy failed; re-encode to H.264 without audio instead.
        cmd = [
            "ffmpeg", "-y", "-i", str(src),
            "-vf", "format=yuv420p", "-movflags", "+faststart",
            "-c:v", "libx264", "-crf", "18", "-preset", "veryfast",
            "-an", str(dst),
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def extract_action_from_filename(fn: str) -> str:
    # e.g. v_BodyWeightSquats_g05_c01.mp4 -> BodyWeightSquats
    m = re.match(r"v_([A-Za-z0-9]+)", fn)
    if m:
        return slugify_action(m.group(1))
    parts = fn.split("/")
    if len(parts) >= 2:
        return slugify_action(parts[-2])
    stem = Path(fn).stem
    stem = re.sub(r"^\w+_", "", stem)
    return slugify_action(stem)
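
# Download every mp4 under one model folder of SRC_REPO, rename each file to
# <folder>_<Action>_<index>_<hash8>.mp4 inside STAGING, and append one mapping row per file.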
def stage_from_hf_model(api, model_dir, rows):
    # 1) From the source repo's file list, keep only this folder's mp4s
    all_remote = list_repo_files(repo_id=SRC_REPO, repo_type="dataset", token=TOKEN)
    remotes = [p for p in all_remote if p.startswith(model_dir + "/") and p.lower().endswith(".mp4")]
    print(f"[FETCH] {model_dir}: {len(remotes)} remote mp4 files")
    if not remotes:
        print(f"[WARN] no matches under {model_dir}")
        return
    # 2) Recreate a clean per-model download folder on every run
    dl_dir = DL_ROOT / model_dir
    if dl_dir.exists():
        shutil.rmtree(dl_dir)
    dl_dir.mkdir(parents=True, exist_ok=True)
    # 3) Keep (remote_path, local_path) pairs (the action is extracted from remote_path!)
    pairs = []
    for rel_remote in remotes:
        local = hf_hub_download(
            repo_id=SRC_REPO,
            filename=rel_remote,  # keep the original path
            repo_type="dataset",
            token=TOKEN,
            local_dir=str(dl_dir),
            local_dir_use_symlinks=False,
        )
        pairs.append((rel_remote, local))
    # 4) Move into staging and rename with the final naming rule
    folder_name = model_dir  # e.g. 'Hunyuan_videos', 'RunwayGen4', ...
    dst_dir = STAGING / folder_name
    dst_dir.mkdir(parents=True, exist_ok=True)
    counters = defaultdict(int)
    moved = 0
    for rel_remote, local in sorted(pairs):
        # Extract the action from the original remote path only
        action_slug = extract_action_from_remote(rel_remote)  # 'body_weight_squats'
        print(action_slug)
        action_camel = camel_action(action_slug)  # 'BodyWeightSquats'
        print(action_camel)
        counters[action_slug] += 1
        idx = counters[action_slug]
        h8 = sha1_8(Path(local))
        # Final naming rule: <Model>_<Action>_<2-digit index>_<hash8>.mp4
        new_name = f"{folder_name}_{action_camel}_{idx:02d}_{h8}.mp4"
        dst = dst_dir / new_name
        shutil.move(local, dst)
        moved += 1
        rows.append([f"hf://{SRC_REPO}/{rel_remote}", folder_name, action_slug, idx, h8, f"{folder_name}/{new_name}"])
    print(f"[STAGED] {model_dir}: moved {moved} files to {dst_dir}")
def stage_from_local_wan22(rows: List[List[str]]):
    pretty_model = "Wan2.2"
    counters = defaultdict(int)
    for class_dir in sorted([p for p in WAN22_LOCAL_ROOT.iterdir() if p.is_dir()]):
        action = slugify_action(class_dir.name)
        for mp4 in sorted([p for p in class_dir.iterdir() if p.suffix.lower() == ".mp4"]):
            counters[action] += 1
            idx = counters[action]
            h8 = sha1_8(mp4)
            # new_name = f"wan2p2_{action}_{idx:02d}_{h8}.mp4"
            pretty_model = WAN22_TARGET_SUBDIR  # e.g. 'Wan2.2'
            camel = camel_action(action)        # e.g. 'PushUps'
            new_name = f"{pretty_model}_{camel}_{idx:02d}_{h8}.mp4"
            dst_dir = STAGING / pretty_model
            dst_dir.mkdir(parents=True, exist_ok=True)
            dst = dst_dir / new_name
            if MAKE_SILENT:
                mute_copy(mp4, dst)
            else:
                shutil.copy2(mp4, dst)  # copy the original as-is (audio untouched)
            rows.append([str(mp4), pretty_model, action, idx, h8, f"{pretty_model}/{new_name}"])
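
# Run the full pipeline: stage the remote model folders, stage the local Wan2.2 videos,
# write mapping.csv, then push the selected subfolders and mapping.csv to DEST_REPO.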
def main():
    if not TOKEN:
        raise SystemExit("Set HF_TOKEN with write permission.")
    ensure_ffmpeg()

    # Reset the working directories
    if STAGING.exists():
        shutil.rmtree(STAGING)
    if DL_ROOT.exists():
        shutil.rmtree(DL_ROOT)
    STAGING.mkdir(parents=True, exist_ok=True)
    DL_ROOT.mkdir(parents=True, exist_ok=True)

    api = HfApi()
    try:
        create_repo(repo_id=DEST_REPO, repo_type="dataset", private=DEST_REPO_IS_PRIVATE, token=TOKEN, exist_ok=True)
    except Exception:
        pass

    rows: List[List[str]] = []
    # 1) Stage only the four model folders from the source repo
    for sub in SRC_SUBDIRS:
        stage_from_hf_model(api, sub, rows)
    # 2) Add the local Wan2.2 videos (60 files)
    stage_from_local_wan22(rows)
    # 3) Save the mapping csv
    with open(STAGING / "mapping.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["source", "model", "action", "idx", "hash8", "dest_path"])
        w.writerows(rows)
    # 4) One explicit add-commit per folder (bypasses change detection)
    subdirs_to_push = ["Hunyuan_videos", "Opensora_768", "RunwayGen4", "wan21_videos"]  #, "Wan2.2"
    for sd in subdirs_to_push:
        sd_path = STAGING / sd
        if not sd_path.exists():
            print(f"[WARN] skip missing subdir: {sd}")
            continue
        files = sorted(str(p) for p in sd_path.rglob("*.mp4"))
        if not files:
            print(f"[WARN] skip empty subdir: {sd}")
            continue
        print(f"[PUSH] {sd} ... ({len(files)} files)")
        ops = []
        for fp in files:
            rel_in_repo = os.path.relpath(fp, start=STAGING)  # e.g. Hunyuan_videos/xxx.mp4
            ops.append(CommitOperationAdd(path_in_repo=rel_in_repo, path_or_fileobj=fp))
        api.create_commit(
            repo_id=DEST_REPO,
            repo_type="dataset",
            operations=ops,
            commit_message=f"Add {sd} ({len(files)} files)",
            token=TOKEN,
        )

    # Upload mapping.csv
    map_csv = STAGING / "mapping.csv"
    if map_csv.exists():
        api.upload_file(
            path_or_fileobj=str(map_csv),
            path_in_repo="mapping.csv",
            repo_id=DEST_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message="Add mapping.csv",
        )
    print(f"[DONE] Pushed to https://huggingface.co/datasets/{DEST_REPO}")

if __name__ == "__main__":
    main()