YaohuiW committed on
Commit 1025828 · verified · 1 Parent(s): 73669b1

Update gradio_tabs/vid_edit.py

Files changed (1): gradio_tabs/vid_edit.py (+64 −152)
gradio_tabs/vid_edit.py CHANGED
@@ -37,118 +37,92 @@ labels_v = [
 ]
 
 
-@torch.compiler.allow_in_graph
 def load_image(img, size):
-
-    img = Image.open(img).convert('RGB')
-    w, h = img.size
-    img = img.resize((size, size))
-    img = np.asarray(img)
-    img = np.copy(img)
+    # img = Image.open(filename).convert('RGB')
+    if not isinstance(img, np.ndarray):
+        img = Image.open(img).convert('RGB')
+    img = img.resize((size, size))
+    img = np.asarray(img)
     img = np.transpose(img, (2, 0, 1)) # 3 x 256 x 256
 
-    return img / 255.0, w, h
+    return img / 255.0
 
 
-@torch.compiler.allow_in_graph
 def img_preprocessing(img_path, size):
-    img, w, h = load_image(img_path, size) # [0, 1]
+    img = load_image(img_path, size) # [0, 1]
     img = torch.from_numpy(img).unsqueeze(0).float() # [0, 1]
     imgs_norm = (img - 0.5) * 2.0 # [-1, 1]
 
-    return imgs_norm, w, h
+    return imgs_norm
 
-# Pre-compile resize transforms for better performance
-resize_transform_cache = {}
 
-def get_resize_transform(size):
-    """Get cached resize transform - creates once, reuses many times"""
-    if size not in resize_transform_cache:
-        # Only create the transform if it doesn't exist in cache
-        resize_transform_cache[size] = torchvision.transforms.Resize(
-            size,
-            interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
-            antialias=True
-        )
-    return resize_transform_cache[size]
 
 def resize(img, size):
-    """Use cached resize transform"""
-    transform = get_resize_transform((size, size))
+    transform = torchvision.transforms.Compose([
+        torchvision.transforms.Resize((size, size), antialias=True),
+    ])
+
     return transform(img)
 
 
 def resize_back(img, w, h):
-    """Use cached resize transform for back operation"""
-    transform = get_resize_transform((h, w))
-    return transform(img)
+    transform = torchvision.transforms.Compose([
+        torchvision.transforms.Resize((h, w), antialias=True),
+    ])
+
+    return transform(img)
+
 
 def vid_preprocessing(vid_path, size):
     vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
-    vid = vid_dict[0].permute(0, 3, 1, 2) # tchw
-    _,_,h,w = vid.size()
+    vid = vid_dict[0].permute(0, 3, 1, 2).unsqueeze(0) # btchw
+    _,_,_,h,w = vid.size()
     fps = vid_dict[2]['video_fps']
     vid_norm = (vid / 255.0 - 0.5) * 2.0 # [-1, 1]
 
-    vid_norm = resize(vid_norm, size)
+    vid_norm = torch.cat([
+        resize(vid_norm[:, i, :, :, :], size).unsqueeze(1) for i in range(vid.size(1))
+    ], dim=1)
 
     return vid_norm, fps, w, h
 
 
 def img_denorm(img):
-    img = img.clamp(-1, 1)
+    img = img.clamp(-1, 1).cpu()
     img = (img - img.min()) / (img.max() - img.min())
 
     return img
 
 
 def vid_denorm(vid):
-    vid = vid.clamp(-1, 1)
+    vid = vid.clamp(-1, 1).cpu()
     vid = (vid - vid.min()) / (vid.max() - vid.min())
 
     return vid
 
 
 def img_postprocessing(image, w, h):
-    img = resize_back(image, w, h)
-
-    # Denormalize ON GPU (avoid early CPU transfer)
-    img = img_denorm(img)
-
-    # Single optimized CPU transfer
-    img = img.squeeze(0).permute(1, 2, 0).contiguous() # contiguous() for fast transfer
-    img_output = (img.cpu().numpy() * 255).astype(np.uint8) # Single CPU transfer
-
-    # return the Numpy array directly, since Gradio supports it
-    return img_output
-
-
-def process_first_frame(vid_path, size):
-    vid_dict = torchvision.io.read_video(vid_path, start_pts=0, end_pts=0, pts_unit='sec')
-    img = vid_dict[0].permute(0, 3, 1, 2) # bchw
-    _, _, h, w = img.size()
-    img_norm = (img / 255.0 - 0.5) * 2.0 # [-1, 1]
-    img_norm = resize(img_norm, size)
-
-    return img_norm, w, h
+    image = resize_back(image, w, h)
+    image = image.permute(0, 2, 3, 1)
+    edited_image = img_denorm(image)
+    img_output = (edited_image[0].numpy() * 255).astype(np.uint8)
+
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+        imageio.imwrite(temp_file.name, img_output, quality=6)
+    return temp_file.name
 
 
 def vid_all_save(vid_d, vid_a, w, h, fps):
-    # vid_d: tchw
-    # vid_a: tchw
 
-    t, c, _, _ = vid_d.size()
-    vid_d_batch = resize_back(vid_d, w, h)
-    vid_a_batch = resize_back(vid_a, w, h)
-
-    vid_d = rearrange(vid_d_batch, "t c h w -> t h w c") # T H W C
-    vid_a = rearrange(vid_a_batch, "t c h w -> t h w c") # T H W C
-    vid_all = torch.cat([vid_d, vid_a], dim=2)
+    b,t,c,_,_ = vid_d.size()
+    vid_d_batch = resize_back(rearrange(vid_d, "b t c h w -> (b t) c h w"), w, h)
+    vid_a_batch = resize_back(rearrange(vid_a, "b c t h w -> (b t) c h w"), w, h)
+
+    vid_d = rearrange(vid_d_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
+    vid_a = rearrange(vid_a_batch, "(b t) c h w -> b t h w c", b=b) # B T H W C
+    vid_all = torch.cat([vid_d, vid_a], dim=3)
 
-    vid_a_np = (vid_denorm(vid_a).cpu().numpy() * 255).astype('uint8')
-    vid_all_np = (vid_denorm(vid_all).cpu().numpy() * 255).astype('uint8')
+    vid_a_np = (vid_denorm(vid_a[0]).numpy() * 255).astype('uint8')
+    vid_all_np = (vid_denorm(vid_all[0]).numpy() * 255).astype('uint8')
 
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as output_path:
         imageio.mimwrite(output_path.name, vid_a_np, fps=fps, codec='libx264', quality=8)
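The removed `get_resize_transform` cached one `torchvision.transforms.Resize` module per output size, whereas the restored `resize`/`resize_back` rebuild a `Compose` on every call. Below is a minimal sketch of that caching pattern using `functools.lru_cache` in place of the hand-rolled dict; the helper name `cached_resize` is hypothetical, not part of the repository.

```python
# Sketch of the transform-caching idea from the deleted code, with
# functools.lru_cache standing in for the resize_transform_cache dict.
from functools import lru_cache

import torch
import torchvision


@lru_cache(maxsize=None)
def cached_resize(size_hw):
    # Built once per distinct (h, w) pair; reused on every later call.
    return torchvision.transforms.Resize(
        size_hw,
        interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
        antialias=True,
    )


frames = torch.rand(8, 3, 720, 1280)          # t c h w in [0, 1]
resized = cached_resize((512, 512))(frames)   # -> 8 x 3 x 512 x 512
```

The transform object itself is cheap to construct, so the cache mostly mattered for tight per-frame loops; dropping it is a reasonable simplification.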
@@ -160,59 +134,16 @@ def vid_all_save(vid_d, vid_a, w, h, fps):
 
 
 def vid_edit(gen, chunk_size, device):
-
-    @torch.compile
-    def compiled_enc_img(image_tensor, selected_s):
-        """Compiled version of just the model inference"""
-        return gen.enc_img(image_tensor, labels_v, selected_s)
-
-    @torch.compile
-    def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
-        """Compiled version of just the model inference"""
-        return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
-
-    @torch.compile
-    def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
-        """Compiled version of animate_batch for animation tab"""
-        return gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-
-    # Pre-warm the compiled model with dummy data to reduce first-run compilation time
-    def _warmup_model():
-        """Pre-warm the model compilation with representative shapes"""
-        print("[img_edit] Pre-warming model compilation...")
-        dummy_image = torch.randn(1, 3, 512, 512, device=device)
-        dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
-        dummy_selected_s = [0.0] * len(labels_v)
-
-        try:
-            with torch.inference_mode():
-                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
-                _ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
-            print("[img_edit] Model pre-warming completed successfully")
-        except Exception as e:
-            print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
-
-        try:
-            with torch.inference_mode():
-                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
-                _ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0], dummy_video)
-            print("[img_animation] Model pre-warming completed successfully")
-        except Exception as e:
-            print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")
-
-    # Pre-warm the model
-    _warmup_model()
-
+
 
     @spaces.GPU
-    @torch.inference_mode()
+    @torch.no_grad()
     def edit_img(video, *selected_s):
 
-        image_tensor, w, h = process_first_frame(video, 512)
-        image_tensor = image_tensor.to(device)
+        vid_target_tensor, fps, w, h = vid_preprocessing(video, 512)
+        video_target_tensor = vid_target_tensor.to(device)
+        image_tensor = video_target_tensor[:,0,:,:,:]
 
-        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
-        edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
+        edited_image_tensor = gen.edit_img(image_tensor, labels_v, selected_s)
 
         # de-norm
         edited_image = img_postprocessing(edited_image_tensor, w, h)
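The deleted block above paid `torch.compile`'s one-time compilation cost at startup by running the compiled encoder/decoder once on dummy tensors, falling back gracefully if tracing failed. A generic sketch of that compile-and-warm-up pattern follows; `TinyNet` is a stand-in module, while the real code compiled `gen.enc_img`, `gen.dec_img`, and `gen.dec_vid`.

```python
# Compile-and-warm-up pattern, mirroring the deleted _warmup_model.
import torch


class TinyNet(torch.nn.Module):
    def forward(self, x):
        return torch.tanh(x) * 2.0


compiled = torch.compile(TinyNet())

try:
    with torch.inference_mode():
        _ = compiled(torch.randn(1, 3, 512, 512))  # triggers compilation now
    print("warm-up completed")
except Exception as e:
    # Non-fatal: the model simply compiles on its first real call instead.
    print(f"warm-up failed (will compile on first use): {e}")
```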
@@ -220,37 +151,21 @@ def vid_edit(gen, chunk_size, device):
         return edited_image
 
     @spaces.GPU
-    @torch.inference_mode()
+    @torch.no_grad()
     def edit_vid(video, *selected_s):
 
         video_target_tensor, fps, w, h = vid_preprocessing(video, 512)
         video_target_tensor = video_target_tensor.to(device)
 
-        img_start = video_target_tensor[0:1, :, :, :]
-
-        res = []
-        t = video_target_tensor.size(1)
-        chunks = t // chunk_size
-
-        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(img_start, selected_s)
-        for i in range(chunks + 1):
-            if i == chunks:
-                img_target_batch = video_target_tensor[i * chunk_size:, :, :, :]
-                img_animated_batch = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-            else:
-                img_target_batch = video_target_tensor[i * chunk_size:(i + 1) * chunk_size, :, :, :]
-                img_animated_batch = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
-
-            res.append(img_animated_batch)
-        edited_video_tensor = torch.cat(res, dim=0) # TCHW
-        edited_image_tensor = edited_video_tensor[0:1,:,:,:]
+        edited_video_tensor = gen.edit_vid_batch(video_target_tensor, labels_v, selected_s, chunk_size)
+        edited_image_tensor = edited_video_tensor[:,:,0,:,:]
 
         # de-norm
         animated_video, animated_all_video = vid_all_save(video_target_tensor, edited_video_tensor, w, h, fps)
         edited_image = img_postprocessing(edited_image_tensor, w, h)
 
-        return edited_image, animated_video, animated_all_video
+        return edited_image, animated_video, animated_all_video
+
 
     def clear_media():
         return None, None, None, *([0] * len(labels_k))
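The removed `edit_vid` body sliced the video into `chunk_size` batches by hand, special-casing the final partial chunk; the same chunking can be expressed with `torch.split`, which yields a shorter last chunk automatically. A sketch under that assumption, where `process_chunk` is a placeholder for the per-chunk decoder call:

```python
# Chunked processing of a T-frame video, equivalent to the deleted loop
# but without the manual index arithmetic and remainder special case.
import torch


def process_chunk(chunk):
    return chunk * 0.5  # stand-in for the real per-chunk model call


video = torch.randn(37, 3, 512, 512)  # t c h w; 37 is not a multiple of 16
chunk_size = 16

res = [process_chunk(c) for c in torch.split(video, chunk_size, dim=0)]
edited = torch.cat(res, dim=0)
assert edited.shape == video.shape
```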
@@ -295,7 +210,7 @@ def vid_edit(gen, chunk_size, device):
             video_all_output = gr.Video(label="Videos", elem_id="output_vid_all")
 
         with gr.Column(scale=1):
-            with gr.Accordion("Control Panel - Using Sliders to Edit Image", open=True):
+            with gr.Accordion("Control Panel", open=True):
                 with gr.Tab("Head"):
                     with gr.Row():
                         for k in labels_k[:3]:
@@ -329,20 +244,17 @@ def vid_edit(gen, chunk_size, device):
         with gr.Row():
             with gr.Column(scale=1):
                 with gr.Row(): # Buttons now within a single Row
-                    #edit_btn = gr.Button("Edit",elem_id="button_edit")
-                    animate_btn = gr.Button("Generate",elem_id="button_generate")
+                    edit_btn = gr.Button("Edit",elem_id="button_edit")
                     clear_btn = gr.Button("Clear",elem_id="button_clear")
-
-        for slider in inputs_s:
-            slider.change(
-                fn=edit_img,
-                inputs=[video_input] + inputs_s,
-                outputs=[image_output],
-                show_progress='hidden',
-                trigger_mode='always_last',
-                # currently we have a latency around 450ms
-                stream_every=0.5
-            )
+                with gr.Row():
+                    animate_btn = gr.Button("Generate",elem_id="button_generate")
+
+        edit_btn.click(
+            fn=edit_img,
+            inputs=[video_input] + inputs_s,
+            outputs=[image_output],
+            show_progress=True
+        )
 
         animate_btn.click(
             fn=edit_vid,
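The deleted wiring re-ran `edit_img` on every slider movement, coalescing rapid events with `trigger_mode='always_last'` and throttling with `stream_every=0.5` (matching the ~450 ms latency noted in the comment); the new wiring returns to an explicit Edit button. A sketch of that live-slider pattern, assuming a Gradio version that supports these event arguments as the deleted lines did; `fake_edit` is a stand-in for `edit_img`.

```python
# Live-slider wiring as in the deleted code, on a toy function.
import gradio as gr


def fake_edit(video, value):
    # Stand-in for edit_img: any function over the listed inputs works.
    return f"edited {video} with {value:+.2f}"


with gr.Blocks() as demo:
    video_input = gr.Video()
    slider = gr.Slider(-1.0, 1.0, value=0.0, label="Edit strength")
    out = gr.Textbox(label="Result")
    slider.change(
        fn=fake_edit,
        inputs=[video_input, slider],
        outputs=[out],
        show_progress='hidden',
        trigger_mode='always_last',  # drop stale events, keep only the latest
        stream_every=0.5,            # throttle, as in the deleted lines
    )

if __name__ == "__main__":
    demo.launch()
```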
@@ -368,9 +280,9 @@ def vid_edit(gen, chunk_size, device):
             ['./data/driving/driving9.mp4', 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, -0.1, 0.07],
         ],
-        fn=edit_vid,
+        fn=edit_vid,
         inputs=[video_input] + inputs_s,
-        outputs=[image_output, video_output, video_all_output],
+        outputs=[image_output, video_output, video_all_output],
     )
 
288