Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -9,6 +9,7 @@ import tempfile
 import warnings
 import time
 import gc
+import uuid
 
 import cv2
 import numpy as np
@@ -53,6 +54,7 @@ function() {
 }
 """
 
+
 def extract_frame(video_path, timestamp):
     # Safety check: if no video is present
     if not video_path:
@@ -89,6 +91,11 @@ def extract_frame(video_path, timestamp):
 # --- END FRAME EXTRACTION LOGIC ---
 
 
+def clear_vram():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
 # RIFE
 if not os.path.exists("RIFEv4.26_0921.zip"):
     print("Downloading RIFE Model...")
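For readers of this diff: the new clear_vram() helper simply pairs Python garbage collection with PyTorch's CUDA cache release. A minimal, self-contained sketch of the intended usage follows; the torch.cuda.is_available() guard is an extra safety net for CPU-only runs and is not part of the committed helper, and the real call site appears in the run_inference hunk further down.

import gc
import torch

def clear_vram():
    # Drop unreachable Python objects that may still hold tensor references,
    # then ask PyTorch to return cached CUDA memory blocks to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

clear_vram()  # the Space calls this right before starting a generation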
@@ -111,6 +118,7 @@ rife_model.device()
 if torch.cuda.is_available():
     rife_model.flownet = rife_model.flownet.half()
 
+
 @torch.no_grad()
 def interpolate_bits(frames_np, multiplier=2, scale=1.0):
     """
@@ -181,7 +189,7 @@ def interpolate_bits(frames_np, multiplier=2, scale=1.0):
         return [*first_half, *second_half]
 
     output_frames = []
-
+
     # Process Frames
     # Load first frame into GPU
     I1 = to_tensor(frames_np[0])
@@ -210,6 +218,7 @@ def interpolate_bits(frames_np, multiplier=2, scale=1.0):
 
     return output_frames
 
+
 # WAN
 
 MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
@@ -257,6 +266,9 @@ quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
 aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
 aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
 
+# pipe.vae.enable_slicing()
+# pipe.vae.enable_tiling()
+
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
 default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
 
@@ -339,9 +351,7 @@ def get_inference_duration(
     gen_time = int(steps) * step_duration
     print(gen_time)
     if guidance_scale > 1:
-        gen_time = gen_time * 1.
-        if guidance_scale_2 > 1:
-            gen_time = gen_time * 1.55
+        gen_time = gen_time * 1.8
 
     if frame_multiplier > 1:
         total_out_frames = (num_frames * frame_multiplier) - num_frames
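To see what the simplified estimate returns, here is a small worked example. step_duration is not shown in this diff, so the value below is purely illustrative, and only the 1.8 multiplier comes from this commit.

step_duration = 10.0   # assumed seconds per step; the real constant lives elsewhere in app.py
steps = 6              # UI default from the slider below
guidance_scale = 3.5   # anything above 1 triggers the multiplier

gen_time = int(steps) * step_duration   # 60.0
if guidance_scale > 1:
    gen_time = gen_time * 1.8           # 108.0
print(gen_time)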
@@ -380,7 +390,10 @@ def run_inference(
     config['flow_shift'] = flow_shift
     pipe.scheduler = scheduler_class.from_config(config)
 
-
+    clear_vram()
+
+    task_name = str(uuid.uuid4())[:8]
+    print(f"Generating {num_frames} frames, task: {task_name}, {duration_seconds}, {resized_image.size}")
     start = time.time()
     result = pipe(
         image=resized_image,
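The new task_name is just the first eight characters of a random UUID, used to tag log lines so overlapping requests can be told apart. A tiny sketch; the printed value is random each run, and "3f2b9c1d" is only an example:

import uuid

task_name = str(uuid.uuid4())[:8]   # e.g. "3f2b9c1d"
print(f"Generating frames, task: {task_name}")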
@@ -398,7 +411,7 @@ def run_inference(
     )
     print("gen time passed:", time.time() - start)
 
-    raw_frames_np = result.frames[0]
+    raw_frames_np = result.frames[0]  # Returns (T, H, W, C) float32
     pipe.scheduler = original_scheduler
 
     if frame_multiplier > 1:
@@ -417,8 +430,9 @@ def run_inference(
     start = time.time()
     export_to_video(final_frames, video_path, fps=final_fps, quality=quality)
     print(f"Export time passed, {final_fps} FPS:", time.time() - start)
-
-    return video_path
+
+    return video_path, task_name
+
 
 def generate_video(
     input_image,
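Since run_inference now returns a pair instead of a bare path, any caller has to unpack both values. A one-line sketch of the pattern the generate_video hunk below uses (argument list abridged here):

video_path, task_n = run_inference(resized_image, processed_last_image, prompt)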
@@ -494,7 +508,7 @@ def generate_video(
     if last_image:
         processed_last_image = resize_and_crop_to_match(last_image, resized_image)
 
-    video_path = run_inference(
+    video_path, task_n = run_inference(
         resized_image,
         processed_last_image,
         prompt,
@@ -511,7 +525,7 @@ def generate_video(
         duration_seconds,
         progress,
     )
-    print("GPU complete")
+    print(f"GPU complete: {task_n}")
 
     return (video_path if video_component else None), video_path, current_seed
 
@@ -531,13 +545,13 @@ CSS = """
 
 
 with gr.Blocks(delete_cache=(3600, 10800)) as demo:
-    gr.Markdown("## WAMU - Wan 2.2 I2V (14B)")
+    gr.Markdown("## WAMU - Wan 2.2 I2V (14B) 🐌")
     gr.Markdown("#### ℹ️ **A Note on Performance:** This version prioritizes a straightforward setup over maximum speed, so performance may vary.")
-    gr.Markdown("
+    gr.Markdown("Run Wan 2.2 in just 4-8 steps, fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU")
 
     with gr.Row():
         with gr.Column():
-            input_image_component = gr.Image(type="pil", label="Input Image")
+            input_image_component = gr.Image(type="pil", label="Input Image", sources=["upload", "clipboard"])
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
             duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
             steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
@@ -554,7 +568,7 @@ with gr.Blocks(delete_cache=(3600, 10800)) as demo:
             seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
             randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
             guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage", info="Values above 1 increase GPU usage and may take longer to process.")
-            guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage"
+            guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")
             scheduler_dropdown = gr.Dropdown(
                 label="Scheduler",
                 choices=list(SCHEDULER_MAP.keys()),
@@ -563,13 +577,14 @@ with gr.Blocks(delete_cache=(3600, 10800)) as demo:
             )
             flow_shift_slider = gr.Slider(minimum=0.5, maximum=15.0, step=0.1, value=3.0, label="Flow Shift")
             play_result_video = gr.Checkbox(label="Display result", value=True, interactive=True)
-
+            org_name = "TestOrganizationPleaseIgnore"
+            gr.Markdown(f"[ZeroGPU Help, Tips, and Troubleshooting](https://huggingface.co/datasets/{org_name}/help/blob/main/gpu_help.md)")
 
             generate_button = gr.Button("Generate Video", variant="primary")
 
         with gr.Column():
             # ASSIGNED elem_id="generated-video" so JS can find it
-            video_output = gr.Video(label="Generated Video", autoplay=True, elem_id="generated-video")
+            video_output = gr.Video(label="Generated Video", autoplay=True, sources=["upload"], buttons=["download", "share"], interactive=True, elem_id="generated-video")
 
             # --- Frame Grabbing UI ---
             with gr.Row():
|