import tempfile
import gradio as gr
import imageio
import spaces
import torch
import torchvision
import numpy as np
from PIL import Image
from einops import rearrange

# labels
labels_k = [
    'yaw1', 'yaw2', 'pitch',
    'roll1', 'roll2', 'neck',
    'pout', 'open->close', '"O" Mouth', 'smile',
    'close->open', 'eyebrows', 'eyeballs1', 'eyeballs2',
]
labels_v = [
    37, 39, 28,
    15, 33, 31,
    6, 25, 16, 19,
    13, 24, 17, 26,
]


@torch.compiler.allow_in_graph
def load_image(img, size):
    img = Image.open(img).convert('RGB')
    w, h = img.size
    img = img.resize((size, size))
    img = np.asarray(img)
    img = np.copy(img)
    img = np.transpose(img, (2, 0, 1))  # C x H x W
    return img / 255.0, w, h


@torch.compiler.allow_in_graph
def img_preprocessing(img_path, size):
    img, w, h = load_image(img_path, size)  # [0, 1]
    img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
    imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
    return imgs_norm, w, h


# Pre-compile resize transforms for better performance
resize_transform_cache = {}


def get_resize_transform(size):
    """Get cached resize transform - creates once, reuses many times"""
    if size not in resize_transform_cache:
        # Only create the transform if it doesn't exist in the cache
        resize_transform_cache[size] = torchvision.transforms.Resize(
            size,
            interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
            antialias=True,
        )
    return resize_transform_cache[size]


def resize(img, size):
    """Use cached resize transform"""
    transform = get_resize_transform((size, size))
    return transform(img)


def resize_back(img, w, h):
    """Use cached resize transform for the back operation"""
    transform = get_resize_transform((h, w))
    return transform(img)


def vid_preprocessing(vid_path, size):
    vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
    vid = vid_dict[0].permute(0, 3, 1, 2)  # TCHW
    fps = vid_dict[2]['video_fps']
    vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
    # vid_norm = torch.cat([
    #     resize(vid_norm[i:i+1, :, :, :], size).unsqueeze(1) for i in range(vid.size(0))
    # ], dim=1)
    vid_norm = resize(vid_norm, size)  # TCHW
    return vid_norm, fps


def img_denorm(img):
    img = img.clamp(-1, 1)
    img = (img - img.min()) / (img.max() - img.min())
    return img


def vid_denorm(vid):
    vid = vid.clamp(-1, 1)
    vid = (vid - vid.min()) / (vid.max() - vid.min())
    return vid


def img_postprocessing(image, w, h):
    img = resize_back(image, w, h)
    # Denormalize ON GPU (avoid an early CPU transfer)
    img = img.clamp(-1, 1)  # still on GPU
    img = (img - img.min()) / (img.max() - img.min())  # still on GPU
    # Single optimized CPU transfer
    img = img.squeeze(0).permute(1, 2, 0).contiguous()  # contiguous() for a fast transfer
    img_output = (img.cpu().numpy() * 255).astype(np.uint8)  # single CPU transfer
    # Return the NumPy array directly, since Gradio supports it
    return img_output


def vid_postprocessing(video, w, h, fps):
    # video: TCHW
    t, c, _, _ = video.size()
    vid = resize_back(video, w, h)
    vid = vid.clamp(-1, 1)
    vid = (vid - vid.min()) / (vid.max() - vid.min())
    vid = rearrange(vid, "t c h w -> t h w c")  # THWC
    vid_np = (vid.cpu().numpy() * 255).astype('uint8')
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        imageio.mimwrite(temp_file.name, vid_np, fps=fps, codec='libx264', quality=8)
    return temp_file.name


def animation(gen, chunk_size, device):

    @torch.compile
    def compiled_enc_img(image_tensor, selected_s):
        """Compiled version of just the model inference"""
        return gen.enc_img(image_tensor, labels_v, selected_s)

    @torch.compile
    def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
        """Compiled version of just the model inference"""
        return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
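    # Note: only the raw model calls (enc_img / dec_img / dec_vid) are wrapped in
    # torch.compile. Preprocessing and the Gradio callbacks stay in eager mode, and
    # _warmup_model() below triggers compilation once at startup rather than on the
    # first slider move.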
    @torch.compile
    def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
        """Compiled version of animate_batch for the animation tab"""
        return gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)

    # Pre-warm the compiled model with dummy data to reduce first-run compilation time
    def _warmup_model():
        """Pre-warm the model compilation with representative shapes"""
        print("[img_edit] Pre-warming model compilation...")
        dummy_image = torch.randn(1, 3, 512, 512, device=device)
        dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
        dummy_selected_s = [0.0] * len(labels_v)
        try:
            with torch.inference_mode():
                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
                _ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
            print("[img_edit] Model pre-warming completed successfully")
        except Exception as e:
            print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
        try:
            with torch.inference_mode():
                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
                _ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0], dummy_video)
            print("[img_animation] Model pre-warming completed successfully")
        except Exception as e:
            print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")

    # Pre-warm the model
    _warmup_model()

    @spaces.GPU
    @torch.inference_mode()
    def edit_media(image, *selected_s):
        image_tensor, w, h = img_preprocessing(image, 512)
        image_tensor = image_tensor.to(device)

        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
        edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)

        # de-norm
        edited_image = img_postprocessing(edited_image_tensor, w, h)
        return edited_image

    @spaces.GPU
    @torch.inference_mode()
    def animate_media(image, video, *selected_s):
        image_tensor, w, h = img_preprocessing(image, 512)
        vid_target_tensor, fps = vid_preprocessing(video, 512)

        image_tensor = image_tensor.to(device)
        video_target_tensor = vid_target_tensor.to(device)  # TCHW
        # animated_video = gen.animate_batch(image_tensor, video_target_tensor, labels_v, selected_s, chunk_size)
        # edited_image = animated_video[:, :, 0, :, :]

        img_start = video_target_tensor[0:1, :, :, :]
        # vid_target_tensor_batch = rearrange(video_target_tensor, 'b t c h w -> (b t) c h w')
        res = []
        t = video_target_tensor.size(0)  # number of frames (the tensor is TCHW)
        chunks = t // chunk_size

        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
        # z_s2r, alpha_r2s, feat_rgb = gen.enc_img(image_tensor, labels_v, selected_s)
        for i in range(chunks + 1):
            if i == chunks:
                # Trailing chunk; it is empty when t is an exact multiple of chunk_size
                img_target = video_target_tensor[i * chunk_size:, :, :, :]
                if img_target.size(0) == 0:
                    break
            else:
                img_target = video_target_tensor[i * chunk_size:(i + 1) * chunk_size, :, :, :]
            img_animated = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target)
            # img_animated_batch = gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
            res.append(img_animated)

        animated_video = torch.cat(res, dim=0)  # TCHW
        edited_image = animated_video[0:1, :, :, :]

        # postprocessing
        animated_video = vid_postprocessing(animated_video, w, h, fps)
        edited_image = img_postprocessing(edited_image, w, h)

        return edited_image, animated_video

    def clear_media():
        return None, None, *([0] * len(labels_k))

    with gr.Tab("Image Animation"):
        inputs_s = []
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Row():
                    with gr.Accordion(open=True, label="Source Image"):
                        image_input = gr.Image(type="filepath", elem_id="input_img", width=512)  # , height=550)
                        gr.Examples(
                            examples=[
                                ["./data/source/macron.png"],
                                ["./data/source/einstein.png"],
                                ["./data/source/taylor.png"],
                                ["./data/source/portrait1.png"],
                                ["./data/source/portrait2.png"],
                                ["./data/source/portrait3.png"],
                            ],
                            inputs=[image_input],
                            visible=True,
                        )
                    with gr.Accordion(open=True, label="Driving Video"):
                        video_input = gr.Video(width=512, elem_id="input_vid")  # , height=550)
                        gr.Examples(
                            examples=[
                                ["./data/driving/driving6.mp4"],
                                ["./data/driving/driving1.mp4"],
                                ["./data/driving/driving2.mp4"],
                                ["./data/driving/driving4.mp4"],
                                ["./data/driving/driving8.mp4"],
                            ],
                            inputs=[video_input],
                            visible=True,
                        )
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Row():
                            # Buttons now within a single Row
                            # edit_btn = gr.Button("Edit", elem_id="button_edit")
                            animate_btn = gr.Button("Animate", elem_id="button_animate")
                        with gr.Row():
                            clear_btn = gr.Button("Clear", elem_id="button_clear")
            with gr.Column(scale=1):
                with gr.Row():
                    with gr.Accordion(open=True, label="Edited Source Image"):
                        # image_output.render()
                        image_output = gr.Image(label="Output Image", elem_id="output_img", type='numpy',
                                                interactive=False, width=512)  # .render()
                    with gr.Accordion(open=True, label="Animated Video"):
                        # video_output.render()
                        video_output = gr.Video(label="Output Video", elem_id="output_vid", width=512)  # .render()

        with gr.Accordion("Control Panel (Using Sliders to Edit Image)", open=True):
            with gr.Tab("Head"):
                with gr.Row():
                    for k in labels_k[:3]:
                        slider = gr.Slider(minimum=-1.0, maximum=0.5, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[3:6]:
                        slider = gr.Slider(minimum=-0.5, maximum=0.5, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
            with gr.Tab("Mouth"):
                with gr.Row():
                    for k in labels_k[6:8]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[8:10]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
            with gr.Tab("Eyes"):
                with gr.Row():
                    for k in labels_k[10:12]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[12:14]:
                        slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)

        for slider in inputs_s:
            slider.change(
                fn=edit_media,
                inputs=[image_input] + inputs_s,
                outputs=[image_output],
                show_progress='hidden',
                trigger_mode='always_last',
                # currently we have a latency of around 450ms
                stream_every=0.5,
            )

        # edit_btn.click(
        #     fn=edit_media,
        #     inputs=[image_input] + inputs_s,
        #     outputs=[image_output],
        #     show_progress=True
        # )

        animate_btn.click(
            fn=animate_media,
            inputs=[image_input, video_input] + inputs_s,
            outputs=[image_output, video_output],
            show_progress=True,
        )

        clear_btn.click(
            fn=clear_media,
            outputs=[image_output, video_output] + inputs_s,
        )

        gr.Examples(
            examples=[
                ['./data/source/macron.png', './data/driving/driving6.mp4',
                 -0.37, -0.34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                ['./data/source/taylor.png', './data/driving/driving6.mp4',
                 -0.31, -0.2, 0, -0.26, -0.14, 0, 0.068, 0.131, 0, 0, 0, 0, -0.058, 0.087],
                ['./data/source/macron.png', './data/driving/driving1.mp4',
                 0.14, 0, -0.26, -0.29, -0.11, 0, -0.13, -0.18, 0, 0, 0, 0, -0.02, 0.07],
                ['./data/source/portrait3.png', './data/driving/driving1.mp4',
                 -0.03, 0.21, -0.31, -0.12, -0.11, 0, -0.05, -0.16, 0, 0, 0, 0, -0.02, 0.07],
                ['./data/source/einstein.png', './data/driving/driving2.mp4',
                 -0.31, 0, 0, 0.16, 0.08, 0, -0.07, 0, 0.13, 0, 0, 0, 0, 0],
                ['./data/source/portrait1.png', './data/driving/driving4.mp4',
                 0, 0, -0.17, -0.19, 0.25, 0, 0, -0.086, 0.087, 0, 0, 0, 0, 0],
                ['./data/source/portrait2.png', './data/driving/driving8.mp4',
                 0, 0, -0.25, 0, 0, 0, 0, 0, 0, 0.126, 0, 0, 0, 0],
            ],
            fn=animate_media,
            inputs=[image_input, video_input] + inputs_s,
            outputs=[image_output, video_output],
        )
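# Hypothetical wiring sketch (assumption, not part of the original module): `animation`
# only builds a gr.Tab, so it has to be called inside an enclosing gr.Blocks() context
# with an already-loaded generator. `Generator`, "checkpoint.pt", and chunk_size=16
# below are placeholder names/values.
#
#     if __name__ == "__main__":
#         device = "cuda" if torch.cuda.is_available() else "cpu"
#         gen = Generator().to(device).eval()  # hypothetical model class
#         gen.load_state_dict(torch.load("checkpoint.pt", map_location=device))
#         with gr.Blocks() as demo:
#             animation(gen, chunk_size=16, device=device)
#         demo.launch()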