Ryan-PR committed on
Commit 67a974c · verified · 1 Parent(s): 491fe1d

Update app.py

Files changed (1)
  1. app.py +42 -72
app.py CHANGED
@@ -1,19 +1,13 @@
  import os
  import time
  import random
- import subprocess
- import importlib

  import gradio as gr
  import cv2
  import numpy as np
  from PIL import Image
-
- os.makedirs("./sam2/SAM2-Video-Predictor/checkpoints/", exist_ok=True)
- os.makedirs("./models/", exist_ok=True)
-
- from huggingface_hub import snapshot_download
-
+ import subprocess
+ import importlib

  def ensure_wan():
  try:
@@ -24,9 +18,6 @@ def ensure_wan():
  env = dict(os.environ)
  print(f"[setup] Installing wan2.1: {cmd}")
  subprocess.run(cmd, shell=True, check=True, env=env)
- importlib.invalidate_caches()
- import wan # noqa
- print("[setup] wan installed.")

  def ensure_flash_attn():
  try:
@@ -48,6 +39,9 @@ def ensure_flash_attn():
  ensure_flash_attn()
  ensure_wan()

+ os.makedirs("./sam2/SAM2-Video-Predictor/checkpoints/", exist_ok=True)
+
+ from huggingface_hub import snapshot_download

  def download_sam2():
  snapshot_download(
@@ -55,7 +49,7 @@
  local_dir="./sam2/SAM2-Video-Predictor/checkpoints/",
  )
  print("Download sam2 completed")
-
+
  def download_refacade():
  snapshot_download(
  repo_id="fishze/Refacade",
@@ -63,28 +57,25 @@
  )
  print("Download refacade completed")

+
  download_sam2()
  download_refacade()

-
  import torch
  import torch.nn.functional as F
  from decord import VideoReader, cpu
  from moviepy.editor import ImageSequenceClip
-
  from sam2.build_sam import build_sam2, build_sam2_video_predictor
  from sam2.sam2_image_predictor import SAM2ImagePredictor
-
  import spaces
-
+ from pipeline import RefacadePipeline
  from vace.models.wan.modules.model_mm import VaceMMModel
  from vace.models.wan.modules.model_tr import VaceWanModel
- from wan.text2video import FlowUniPCMultistepScheduler
  from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+ from wan.text2video import FlowUniPCMultistepScheduler
  from diffusers.utils import export_to_video, load_image, load_video
  from vae import WanVAE

-
  COLOR_PALETTE = [
  (255, 0, 0),
  (0, 255, 0),
@@ -101,10 +92,8 @@ COLOR_PALETTE = [
  video_length = 201
  W = 1024
  H = W
-
- DEVICE_SAM = "cpu"
- DEVICE_PIPE = "cuda"
-
+ device = "cuda"
+ sam_device = "cpu"

  def get_pipe_image_and_video_predictor():
  vae = WanVAE(
@@ -112,61 +101,50 @@ def get_pipe_image_and_video_predictor():
  dtype=torch.float16,
  )

+ pipe_device = "cuda"
+
  texture_remover = VaceWanModel.from_config(
  "./models/texture_remover/texture_remover.json"
  )
- ckpt_tr = torch.load(
+ ckpt = torch.load(
  "./models/texture_remover/texture_remover.pth",
  map_location="cpu",
  )
- texture_remover.load_state_dict(ckpt_tr)
- texture_remover = texture_remover.to(dtype=torch.float16, device=DEVICE_PIPE)
+ texture_remover.load_state_dict(ckpt)
+ texture_remover = texture_remover.to(dtype=torch.float16, device=pipe_device)

  model = VaceMMModel.from_config(
  "./models/refacade/refacade.json"
  )
- ckpt_ref = torch.load(
+ ckpt = torch.load(
  "./models/refacade/refacade.pth",
  map_location="cpu",
  )
- model.load_state_dict(ckpt_ref)
- model = model.to(dtype=torch.float16, device=DEVICE_PIPE)
+ model.load_state_dict(ckpt)
+ model = model.to(dtype=torch.float16, device=pipe_device)

  sample_scheduler = FlowUniPCMultistepScheduler(
  num_train_timesteps=1000,
  shift=1,
  )
-
- from pipeline import RefacadePipeline
  pipe = RefacadePipeline(
  vae=vae,
  transformer=model,
  texture_remover=texture_remover,
  scheduler=sample_scheduler,
  )
- pipe.to(DEVICE_PIPE)
+ pipe.to(pipe_device)

  sam2_checkpoint = "./sam2/SAM2-Video-Predictor/checkpoints/sam2_hiera_large.pt"
  config = "sam2_hiera_l.yaml"

- video_predictor = build_sam2_video_predictor(
- config,
- sam2_checkpoint,
- device="cuda",
- )
-
- model_sam = build_sam2(
- config,
- sam2_checkpoint,
- device=DEVICE_SAM,
- )
+ video_predictor = build_sam2_video_predictor(config, sam2_checkpoint, device=sam_device)
+ model_sam = build_sam2(config, sam2_checkpoint, device=sam_device)
  model_sam.image_size = 1024
  image_predictor = SAM2ImagePredictor(sam_model=model_sam)

  return pipe, image_predictor, video_predictor

- pipe, image_predictor, video_predictor = get_pipe_image_and_video_predictor()
-

  def get_video_info(video_path, video_state):
  video_state["input_points"] = []
@@ -194,27 +172,26 @@ def get_video_info(video_path, video_state):
  image = Image.fromarray(first_frame)
  return image

+
  def segment_frame(evt: gr.SelectData, label, video_state):
  if video_state["origin_images"] is None:
  return None
-
  x, y = evt.index
  new_point = [x, y]
  label_value = 1 if label == "Positive" else 0

  video_state["input_points"].append(new_point)
  video_state["input_labels"].append(label_value)
-
  height, width = video_state["origin_images"][0].shape[0:2]
  scaled_points = []
  for pt in video_state["input_points"]:
  sx = pt[0] / width
  sy = pt[1] / height
  scaled_points.append([sx, sy])
+
  video_state["scaled_points"] = scaled_points

- img0 = video_state["origin_images"][0]
- image_predictor.set_image(img0)
+ image_predictor.set_image(video_state["origin_images"][0])
  mask, _, _ = image_predictor.predict(
  point_coords=video_state["scaled_points"],
  point_labels=video_state["input_labels"],
@@ -231,10 +208,9 @@ def segment_frame(evt: gr.SelectData, label, video_state):
  / 255.0
  )
  color = color[None, None, :]
- org_image = img0.astype(np.float32) / 255.0
+ org_image = video_state["origin_images"][0].astype(np.float32) / 255.0
  painted_image = (1 - mask * 0.5) * org_image + mask * 0.5 * color
  painted_image = np.uint8(np.clip(painted_image * 255, 0, 255))
-
  video_state["painted_images"] = np.expand_dims(painted_image, axis=0)
  video_state["masks"] = np.expand_dims(mask[:, :, 0], axis=0)

@@ -247,6 +223,7 @@ def segment_frame(evt: gr.SelectData, label, video_state):

  return Image.fromarray(painted_image)

+
  def clear_clicks(video_state):
  video_state["input_points"] = []
  video_state["input_labels"] = []
@@ -260,6 +237,7 @@ def clear_clicks(video_state):
  else None
  )

+
  def set_ref_image(ref_img, ref_state):
  if ref_img is None:
  return None
@@ -277,6 +255,7 @@ def set_ref_image(ref_img, ref_state):

  return Image.fromarray(img_np)

+
  def segment_ref_frame(evt: gr.SelectData, label, ref_state):
  if ref_state["origin_image"] is None:
  return None
@@ -320,7 +299,7 @@ def segment_ref_frame(evt: gr.SelectData, label, ref_state):
  painted = (1 - mask * 0.5) * org_image + mask * 0.5 * color
  painted = np.uint8(np.clip(painted * 255, 0, 255))

- for i in range(len(ref_state["input_points"])):
+ for i in range(len(ref_state["input_points"])):
  point = ref_state["input_points"][i]
  if ref_state["input_labels"][i] == 0:
  cv2.circle(painted, point, radius=3, color=(0, 0, 255), thickness=-1)
@@ -329,6 +308,7 @@ def segment_ref_frame(evt: gr.SelectData, label, ref_state):

  return Image.fromarray(painted)

+
  def clear_ref_clicks(ref_state):
  ref_state["input_points"] = []
  ref_state["input_labels"] = []
@@ -366,11 +346,11 @@ def track_video(n_frames, video_state):
  sam2_checkpoint = "./sam2/SAM2-Video-Predictor/checkpoints/sam2_hiera_large.pt"
  config = "sam2_hiera_l.yaml"
  video_predictor_local = build_sam2_video_predictor(
- config, sam2_checkpoint, device="cuda"
+ config, sam2_checkpoint, device=sam_device
  )

  inference_state = video_predictor_local.init_state(
- images=images / 255, device="cuda"
+ images=images / 255, device=sam_device
  )

  if len(torch.from_numpy(video_state["masks"][0]).shape) == 3:
@@ -417,6 +397,7 @@
  print("Tracking done")
  return video_file, video_state

+
  @spaces.GPU(duration=50)
  def inference_and_return_video(
  dilate_radius,
@@ -477,7 +458,7 @@
  ref_mask_bin = (ref_mask_np > 0.5).astype(np.uint8) * 255
  ref_mask_pil = Image.fromarray(ref_mask_bin, mode="L")

- pipe.to(DEVICE_PIPE)
+ pipe.to("cuda")
  with torch.no_grad():
  retex_frames, mesh_frames, ref_img_out = pipe(
  video=video_frames,
@@ -493,7 +474,7 @@
  guidance_scale=float(guidance_scale),
  reference_patch_ratio=float(ref_patch_ratio),
  fg_thresh=float(fg_threshold),
- generator=torch.Generator(device=DEVICE_PIPE).manual_seed(seed),
+ generator=torch.Generator(device="cuda").manual_seed(seed),
  return_dict=False,
  )

@@ -522,7 +503,6 @@

  return retex_video_file, mesh_video_file, ref_image_to_show

- # ================== Gradio UI ==================

  text = """
  <div style='text-align:center; font-size:32px; font-family: Arial, Helvetica, sans-serif;'>
@@ -533,6 +513,8 @@ text = """
  </div>
  """

+ pipe, image_predictor, video_predictor = get_pipe_image_and_video_predictor()
+
  with gr.Blocks() as demo:
  video_state = gr.State(
  {
@@ -564,7 +546,7 @@ with gr.Blocks() as demo:
  with gr.Column():
  video_input = gr.Video(label="Upload Video", elem_id="my-video1")
  get_info_btn = gr.Button("Extract First Frame", elem_id="my-btn")
-
+
  gr.Examples(
  examples=[
  ["./examples/1.mp4"],
@@ -576,7 +558,7 @@
  ],
  inputs=[video_input],
  label="You can upload or choose a source video below to retexture.",
- elem_id="my-btn2",
+ elem_id="my-btn2"
  )

  image_output = gr.Image(
@@ -623,18 +605,6 @@ with gr.Blocks() as demo:
  width: 60% !important;
  margin: 0 auto;
  }
- #my-btn3 button {
- width: 120px !important;
- max-width: 120px !important;
- min-width: 120px !important;
- height: 70px !important;
- max-height: 70px !important;
- min-height: 70px !important;
- margin: 8px !important;
- border-radius: 8px !important;
- overflow: hidden !important;
- white-space: normal !important;
- }
  #ref_title {
  text-align: center;
  }
@@ -686,7 +656,7 @@ with gr.Blocks() as demo:
  ],
  inputs=[ref_image_input],
  label="You can upload or choose a reference image below to retexture.",
- elem_id="my-btn3",
+ elem_id="my-btn3"
  )
  ref_image_display = gr.Image(
  label="Reference Mask Segmentation",
@@ -742,7 +712,7 @@ with gr.Blocks() as demo:
  maximum=2147483647,
  value=42,
  step=1,
- label="Seed",
+ label="Seed",
  )

  remove_btn = gr.Button("Retexture", elem_id="my-btn")