Spaces:

YaohuiW
/

LIA-X

Running on Zero

App Files Files Community

YaohuiW committed on Aug 22

Commit

1025828

verified ·

1 Parent(s): 73669b1

Update gradio_tabs/vid_edit.py

Browse files

Files changed (1) hide show

gradio_tabs/vid_edit.py +64 -152

gradio_tabs/vid_edit.py CHANGED Viewed

@@ -37,118 +37,92 @@ labels_v = [
 ]
-@torch.compiler.allow_in_graph
 def load_image(img, size):
-	img = Image.open(img).convert('RGB')
-	w, h = img.size
-	img = img.resize((size, size))
-	img = np.asarray(img)
-	img = np.copy(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x 256 x 256
-	return img / 255.0, w, h
-@torch.compiler.allow_in_graph
 def img_preprocessing(img_path, size):
-	img, w, h = load_image(img_path, size)	# [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
-	return imgs_norm, w, h
-# Pre-compile resize transforms for better performance
-resize_transform_cache = {}
-def get_resize_transform(size):
-	"""Get cached resize transform - creates once, reuses many times"""
-	if size not in resize_transform_cache:
-		# Only create the transform if it doesn't exist in cache
-		resize_transform_cache[size] = torchvision.transforms.Resize(
-			size,
-			interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
-			antialias=True
-		)
-	return resize_transform_cache[size]
 def resize(img, size):
-	"""Use cached resize transform"""
-	transform = get_resize_transform((size, size))
 	return transform(img)
 def resize_back(img, w, h):
-	"""Use cached resize transform for back operation"""
-	transform = get_resize_transform((h, w))
-	return transform(img)
 def vid_preprocessing(vid_path, size):
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
-	vid = vid_dict[0].permute(0, 3, 1, 2)	# tchw
-	_,_,h,w = vid.size()
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
-	vid_norm = resize(vid_norm, size)
 	return vid_norm, fps, w, h
 def img_denorm(img):
-	img = img.clamp(-1, 1)
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
 def vid_denorm(vid):
-	vid = vid.clamp(-1, 1)
 	vid = (vid - vid.min()) / (vid.max() - vid.min())
 	return vid
 def img_postprocessing(image, w, h):
-	img = resize_back(image, w, h)
-	# Denormalize ON GPU (avoid early CPU transfer)
-	img = img_denorm(img)
-	# Single optimized CPU transfer
-	img = img.squeeze(0).permute(1, 2, 0).contiguous()	# contiguous() for fast transfer
-	img_output = (img.cpu().numpy() * 255).astype(np.uint8)  # Single CPU transfer
-	# return the Numpy array directly, since Gradio supports it
-	return img_output
-def process_first_frame(vid_path, size):
-	vid_dict = torchvision.io.read_video(vid_path, start_pts=0, end_pts=0, pts_unit='sec')
-	img = vid_dict[0].permute(0, 3, 1, 2)  # bchw
-	_, _, h, w = img.size()
-	img_norm = (img / 255.0 - 0.5) * 2.0 # [-1, 1]
-	img_norm = resize(img_norm, size)
-	return img_norm, w, h
 def vid_all_save(vid_d, vid_a, w, h, fps):
-	# vid_d: tchw
-	# vid_a: tchw
-	t, c, _, _ = vid_d.size()
-	vid_d_batch = resize_back(vid_d, w, h)
-	vid_a_batch = resize_back(vid_a, w, h)
-	vid_d = rearrange(vid_d_batch, "t c h w -> t h w c")  # T H W C
-	vid_a = rearrange(vid_a_batch, "t c h w -> t h w c")  # T H W C
-	vid_all = torch.cat([vid_d, vid_a], dim=2)
-	vid_a_np = (vid_denorm(vid_a).cpu().numpy() * 255).astype('uint8')
-	vid_all_np = (vid_denorm(vid_all).cpu().numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as output_path:
 		imageio.mimwrite(output_path.name, vid_a_np, fps=fps, codec='libx264', quality=8)
@@ -160,59 +134,16 @@ def vid_all_save(vid_d, vid_a, w, h, fps):
 def vid_edit(gen, chunk_size, device):
-	@torch.compile
-	def compiled_enc_img(image_tensor, selected_s):
-		"""Compiled version of just the model inference"""
-		return gen.enc_img(image_tensor, labels_v, selected_s)
-	@torch.compile
-	def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
-		"""Compiled version of just the model inference"""
-		return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
-	@torch.compile
-	def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
-		"""Compiled version of animate_batch for animation tab"""
-		return gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-	# Pre-warm the compiled model with dummy data to reduce first-run compilation time
-	def _warmup_model():
-		"""Pre-warm the model compilation with representative shapes"""
-		print("[img_edit] Pre-warming model compilation...")
-		dummy_image = torch.randn(1, 3, 512, 512, device=device)
-		dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
-		dummy_selected_s = [0.0] * len(labels_v)
-		try:
-			with torch.inference_mode():
-				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
-				_ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
-			print("[img_edit] Model pre-warming completed successfully")
-		except Exception as e:
-			print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
-		try:
-			with torch.inference_mode():
-				z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
-				_ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0], dummy_video)
-			print("[img_animation] Model pre-warming completed successfully")
-		except Exception as e:
-			print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")
-	# Pre-warm the model
-	_warmup_model()
 	@spaces.GPU
-	@torch.inference_mode()
 	def edit_img(video, *selected_s):
-		image_tensor, w, h = process_first_frame(video, 512)
-		image_tensor = image_tensor.to(device)
-		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
-		edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
@@ -220,37 +151,21 @@ def vid_edit(gen, chunk_size, device):
 		return edited_image
 	@spaces.GPU
-	@torch.inference_mode()
 	def edit_vid(video, *selected_s):
 		video_target_tensor, fps, w, h = vid_preprocessing(video, 512)
 		video_target_tensor = video_target_tensor.to(device)
-		img_start = video_target_tensor[0:1, :, :, :]
-		res = []
-		t = video_target_tensor.size(1)
-		chunks = t // chunk_size
-		z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(img_start, selected_s)
-		for i in range(chunks + 1):
-			if i == chunks:
-				img_target_batch = video_target_tensor[i * chunk_size:, :, :, :]
-				img_animated_batch = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-			else:
-				img_target_batch = video_target_tensor[i * chunk_size:(i + 1) * chunk_size, :, :, :]
-				img_animated_batch = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-			res.append(img_animated_batch)
-		edited_video_tensor = torch.cat(res, dim=0)  # TCHW
-		edited_image_tensor = edited_video_tensor[0:1,:,:,:]
 		# de-norm
 		animated_video, animated_all_video = vid_all_save(video_target_tensor, edited_video_tensor, w, h, fps)
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
-		return edited_image, animated_video, animated_all_video
 	def clear_media():
 		return None, None, None, *([0] * len(labels_k))
@@ -295,7 +210,7 @@ def vid_edit(gen, chunk_size, device):
 						video_all_output = gr.Video(label="Videos", elem_id="output_vid_all")
 			with gr.Column(scale=1):
-				with gr.Accordion("Control Panel - Using Sliders to Edit Image", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
@@ -329,20 +244,17 @@ def vid_edit(gen, chunk_size, device):
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
-							#edit_btn = gr.Button("Edit",elem_id="button_edit")
-							animate_btn = gr.Button("Generate",elem_id="button_generate")
 							clear_btn = gr.Button("Clear",elem_id="button_clear")
-		for slider in inputs_s:
-			slider.change(
-				fn=edit_img,
-				inputs=[video_input] + inputs_s,
-				outputs=[image_output],
-				show_progress='hidden',
-				trigger_mode='always_last',
-				# currently we have a latency around 450ms
-				stream_every=0.5
-			)
 		animate_btn.click(
 			fn=edit_vid,
@@ -368,9 +280,9 @@ def vid_edit(gen, chunk_size, device):
 				['./data/driving/driving9.mp4', 0, 0, 0, 0, 0, 0, 0,
 				 0, 0, 0, 0, 0, -0.1, 0.07],
 			],
-			fn=edit_vid,
 			inputs=[video_input] + inputs_s,
-			outputs=[image_output, video_output, video_all_output],
 		)

 ]
 def load_image(img, size):
+	# img = Image.open(filename).convert('RGB')
+	if not isinstance(img, np.ndarray):
+		img = Image.open(img).convert('RGB')
+		img = img.resize((size, size))
+		img = np.asarray(img)
 	img = np.transpose(img, (2, 0, 1))	# 3 x 256 x 256
+	return img / 255.0
 def img_preprocessing(img_path, size):
+	img = load_image(img_path, size)  # [0, 1]
 	img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
 	imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
+	return imgs_norm
 def resize(img, size):
+	transform = torchvision.transforms.Compose([
+		torchvision.transforms.Resize((size, size), antialias=True),
+	])
 	return transform(img)
 def resize_back(img, w, h):
+	transform = torchvision.transforms.Compose([
+		torchvision.transforms.Resize((h, w), antialias=True),
+	])
+	return transform(img)
 def vid_preprocessing(vid_path, size):
 	vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
+	vid = vid_dict[0].permute(0, 3, 1, 2).unsqueeze(0)	# btchw
+	_,_,_,h,w = vid.size()
 	fps = vid_dict[2]['video_fps']
 	vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
+	vid_norm = torch.cat([
+		resize(vid_norm[:, i, :, :, :], size).unsqueeze(1) for i in range(vid.size(1))
+	], dim=1)
 	return vid_norm, fps, w, h
 def img_denorm(img):
+	img = img.clamp(-1, 1).cpu()
 	img = (img - img.min()) / (img.max() - img.min())
 	return img
 def vid_denorm(vid):
+	vid = vid.clamp(-1, 1).cpu()
 	vid = (vid - vid.min()) / (vid.max() - vid.min())
 	return vid
 def img_postprocessing(image, w, h):
+	image = resize_back(image, w, h)
+	image = image.permute(0, 2, 3, 1)
+	edited_image = img_denorm(image)
+	img_output = (edited_image[0].numpy() * 255).astype(np.uint8)
+	with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+		imageio.imwrite(temp_file.name, img_output, quality=6)
+		return temp_file.name
 def vid_all_save(vid_d, vid_a, w, h, fps):
+	b,t,c,_,_ = vid_d.size()
+	vid_d_batch = resize_back(rearrange(vid_d, "b t c h w -> (b t) c h w"), w, h)
+	vid_a_batch = resize_back(rearrange(vid_a, "b c t h w -> (b t) c h w"), w, h)
+	vid_d = rearrange(vid_d_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
+	vid_a = rearrange(vid_a_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
+	vid_all = torch.cat([vid_d, vid_a], dim=3)
+	vid_a_np = (vid_denorm(vid_a[0]).numpy() * 255).astype('uint8')
+	vid_all_np = (vid_denorm(vid_all[0]).numpy() * 255).astype('uint8')
 	with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as output_path:
 		imageio.mimwrite(output_path.name, vid_a_np, fps=fps, codec='libx264', quality=8)
 def vid_edit(gen, chunk_size, device):
 	@spaces.GPU
+	@torch.no_grad()
 	def edit_img(video, *selected_s):
+		vid_target_tensor, fps, w, h = vid_preprocessing(video, 512)
+		video_target_tensor = vid_target_tensor.to(device)
+		image_tensor = video_target_tensor[:,0,:,:,:]
+		edited_image_tensor = gen.edit_img(image_tensor, labels_v, selected_s)
 		# de-norm
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
 		return edited_image
 	@spaces.GPU
+	@torch.no_grad()
 	def edit_vid(video, *selected_s):
 		video_target_tensor, fps, w, h = vid_preprocessing(video, 512)
 		video_target_tensor = video_target_tensor.to(device)
+		edited_video_tensor = gen.edit_vid_batch(video_target_tensor, labels_v, selected_s, chunk_size)
+		edited_image_tensor = edited_video_tensor[:,:,0,:,:]
 		# de-norm
 		animated_video, animated_all_video = vid_all_save(video_target_tensor, edited_video_tensor, w, h, fps)
 		edited_image = img_postprocessing(edited_image_tensor, w, h)
+		return edited_image, animated_video, animated_all_video
 	def clear_media():
 		return None, None, None, *([0] * len(labels_k))
 						video_all_output = gr.Video(label="Videos", elem_id="output_vid_all")
 			with gr.Column(scale=1):
+				with gr.Accordion("Control Panel", open=True):
 					with gr.Tab("Head"):
 						with gr.Row():
 							for k in labels_k[:3]:
 				with gr.Row():
 					with gr.Column(scale=1):
 						with gr.Row():	# Buttons now within a single Row
+							edit_btn = gr.Button("Edit",elem_id="button_edit")
 							clear_btn = gr.Button("Clear",elem_id="button_clear")
+						with gr.Row():
+							animate_btn = gr.Button("Generate",elem_id="button_generate")
+		edit_btn.click(
+			fn=edit_img,
+			inputs=[video_input] + inputs_s,
+			outputs=[image_output],
+			show_progress=True
+		)
 		animate_btn.click(
 			fn=edit_vid,
 				['./data/driving/driving9.mp4', 0, 0, 0, 0, 0, 0, 0,
 				 0, 0, 0, 0, 0, -0.1, 0.07],
 			],
+            fn=edit_vid,
 			inputs=[video_input] + inputs_s,
+            outputs=[image_output, video_output, video_all_output],
 		)