import tempfile
import gradio as gr
import imageio
import spaces
import torch
import torchvision
import numpy as np
from PIL import Image
from einops import rearrange

# labels
labels_k = [
    'yaw1', 'yaw2', 'pitch',
    'roll1', 'roll2', 'neck',
    'pout', 'open->close', '"O" Mouth', 'smile',
    'close->open', 'eyebrows', 'eyeballs1', 'eyeballs2',
]
labels_v = [
    37, 39, 28,
    15, 33, 31,
    6, 25, 16, 19,
    13, 24, 17, 26,
]


@torch.compiler.allow_in_graph
def load_image(img, size):
    img = Image.open(img).convert('RGB')
    w, h = img.size
    img = img.resize((size, size))
    img = np.asarray(img)
    img = np.copy(img)
    img = np.transpose(img, (2, 0, 1))  # C x H x W
    return img / 255.0, w, h


@torch.compiler.allow_in_graph
def img_preprocessing(img_path, size):
    img, w, h = load_image(img_path, size)  # [0, 1]
    img = torch.from_numpy(img).unsqueeze(0).float()  # [0, 1]
    imgs_norm = (img - 0.5) * 2.0  # [-1, 1]
    return imgs_norm, w, h


# Pre-compile resize transforms for better performance
resize_transform_cache = {}


def get_resize_transform(size):
    """Get cached resize transform - creates once, reuses many times"""
    if size not in resize_transform_cache:
        # Only create the transform if it doesn't exist in the cache
        resize_transform_cache[size] = torchvision.transforms.Resize(
            size,
            interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
            antialias=True,
        )
    return resize_transform_cache[size]


def resize(img, size):
    """Use cached resize transform"""
    transform = get_resize_transform((size, size))
    return transform(img)


def resize_back(img, w, h):
    """Use cached resize transform for the back operation"""
    transform = get_resize_transform((h, w))
    return transform(img)


def vid_preprocessing(vid_path, size):
    vid_dict = torchvision.io.read_video(vid_path, pts_unit='sec')
    vid = vid_dict[0].permute(0, 3, 1, 2)  # TCHW
    fps = vid_dict[2]['video_fps']
    vid_norm = (vid / 255.0 - 0.5) * 2.0  # [-1, 1]
    # vid_norm = torch.cat([
    #     resize(vid_norm[i:i+1, :, :, :], size).unsqueeze(1) for i in range(vid.size(0))
    # ], dim=1)
    vid_norm = resize(vid_norm, size)  # TCHW
    return vid_norm, fps


def img_denorm(img):
    img = img.clamp(-1, 1)
    img = (img - img.min()) / (img.max() - img.min())
    return img


def vid_denorm(vid):
    vid = vid.clamp(-1, 1)
    vid = (vid - vid.min()) / (vid.max() - vid.min())
    return vid


def img_postprocessing(image, w, h):
    img = resize_back(image, w, h)
    # Denormalize ON GPU (avoid an early CPU transfer)
    img = img.clamp(-1, 1)  # still on GPU
    img = (img - img.min()) / (img.max() - img.min())  # still on GPU
    # Single optimized CPU transfer
    img = img.squeeze(0).permute(1, 2, 0).contiguous()  # contiguous() for a fast transfer
    img_output = (img.cpu().numpy() * 255).astype(np.uint8)  # single CPU transfer
    # Return the NumPy array directly, since Gradio supports it
    return img_output


def vid_postprocessing(video, w, h, fps):
    # video: TCHW
    t, c, _, _ = video.size()
    vid = resize_back(video, w, h)
    vid = vid.clamp(-1, 1)
    vid = (vid - vid.min()) / (vid.max() - vid.min())
    vid = rearrange(vid, "t c h w -> t h w c")  # THWC
    vid_np = (vid.cpu().numpy() * 255).astype('uint8')
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        imageio.mimwrite(temp_file.name, vid_np, fps=fps, codec='libx264', quality=8)
    return temp_file.name


def animation(gen, chunk_size, device):

    @torch.compile
    def compiled_enc_img(image_tensor, selected_s):
        """Compiled version of just the model inference"""
        return gen.enc_img(image_tensor, labels_v, selected_s)

    @torch.compile
    def compiled_dec_img(z_s2r, alpha_r2s, feat_rgb):
        """Compiled version of just the model inference"""
        return gen.dec_img(z_s2r, alpha_r2s, feat_rgb)
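    # Note: only the raw model calls (enc_img / dec_img / dec_vid) are wrapped in
    # torch.compile. Preprocessing and the Gradio callbacks stay in eager mode, and
    # _warmup_model() below triggers compilation once at startup rather than on the
    # first slider move.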
    @torch.compile
    def compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch):
        """Compiled version of animate_batch for the animation tab"""
        return gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)

    # Pre-warm the compiled model with dummy data to reduce first-run compilation time
    def _warmup_model():
        """Pre-warm the model compilation with representative shapes"""
        print("[img_edit] Pre-warming model compilation...")
        dummy_image = torch.randn(1, 3, 512, 512, device=device)
        dummy_video = torch.randn(chunk_size, 3, 512, 512, device=device)
        dummy_selected_s = [0.0] * len(labels_v)
        try:
            with torch.inference_mode():
                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
                _ = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)
            print("[img_edit] Model pre-warming completed successfully")
        except Exception as e:
            print(f"[img_edit] Model pre-warming failed (will compile on first use): {e}")
        try:
            with torch.inference_mode():
                z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(dummy_image, dummy_selected_s)
                _ = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, dummy_video[0], dummy_video)
            print("[img_animation] Model pre-warming completed successfully")
        except Exception as e:
            print(f"[img_animation] Model pre-warming failed (will compile on first use): {e}")

    # Pre-warm the model
    _warmup_model()

    @spaces.GPU
    @torch.inference_mode()
    def edit_media(image, *selected_s):
        image_tensor, w, h = img_preprocessing(image, 512)
        image_tensor = image_tensor.to(device)

        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
        edited_image_tensor = compiled_dec_img(z_s2r, alpha_r2s, feat_rgb)

        # de-norm
        edited_image = img_postprocessing(edited_image_tensor, w, h)
        return edited_image

    @spaces.GPU
    @torch.inference_mode()
    def animate_media(image, video, *selected_s):
        image_tensor, w, h = img_preprocessing(image, 512)
        vid_target_tensor, fps = vid_preprocessing(video, 512)

        image_tensor = image_tensor.to(device)
        video_target_tensor = vid_target_tensor.to(device)  # TCHW
        # animated_video = gen.animate_batch(image_tensor, video_target_tensor, labels_v, selected_s, chunk_size)
        # edited_image = animated_video[:, :, 0, :, :]

        img_start = video_target_tensor[0:1, :, :, :]
        # vid_target_tensor_batch = rearrange(video_target_tensor, 'b t c h w -> (b t) c h w')
        res = []
        t = video_target_tensor.size(0)  # number of frames (the tensor is TCHW)
        chunks = t // chunk_size

        z_s2r, alpha_r2s, feat_rgb = compiled_enc_img(image_tensor, selected_s)
        # z_s2r, alpha_r2s, feat_rgb = gen.enc_img(image_tensor, labels_v, selected_s)
        for i in range(chunks + 1):
            if i == chunks:
                # Trailing chunk; it is empty when t is an exact multiple of chunk_size
                img_target = video_target_tensor[i * chunk_size:, :, :, :]
                if img_target.size(0) == 0:
                    break
            else:
                img_target = video_target_tensor[i * chunk_size:(i + 1) * chunk_size, :, :, :]
            img_animated = compiled_dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target)
            # img_animated_batch = gen.dec_vid(z_s2r, alpha_r2s, feat_rgb, img_start, img_target_batch)
            res.append(img_animated)

        animated_video = torch.cat(res, dim=0)  # TCHW
        edited_image = animated_video[0:1, :, :, :]

        # postprocessing
        animated_video = vid_postprocessing(animated_video, w, h, fps)
        edited_image = img_postprocessing(edited_image, w, h)

        return edited_image, animated_video

    def clear_media():
        return None, None, *([0] * len(labels_k))

    with gr.Tab("Image Animation"):
        inputs_s = []
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Row():
                    with gr.Accordion(open=True, label="Source Image"):
                        image_input = gr.Image(type="filepath", elem_id="input_img", width=512)  # , height=550)
                        gr.Examples(
                            examples=[
                                ["./data/source/macron.png"],
                                ["./data/source/einstein.png"],
                                ["./data/source/taylor.png"],
                                ["./data/source/portrait1.png"],
                                ["./data/source/portrait2.png"],
                                ["./data/source/portrait3.png"],
                            ],
                            inputs=[image_input],
                            visible=True,
                        )
                    with gr.Accordion(open=True, label="Driving Video"):
                        video_input = gr.Video(width=512, elem_id="input_vid")  # , height=550)
                        gr.Examples(
                            examples=[
                                ["./data/driving/driving6.mp4"],
                                ["./data/driving/driving1.mp4"],
                                ["./data/driving/driving2.mp4"],
                                ["./data/driving/driving4.mp4"],
                                ["./data/driving/driving8.mp4"],
                            ],
                            inputs=[video_input],
                            visible=True,
                        )
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Row():
                            # Buttons now within a single Row
                            # edit_btn = gr.Button("Edit", elem_id="button_edit")
                            animate_btn = gr.Button("Animate", elem_id="button_animate")
                        with gr.Row():
                            clear_btn = gr.Button("Clear", elem_id="button_clear")
            with gr.Column(scale=1):
                with gr.Row():
                    with gr.Accordion(open=True, label="Edited Source Image"):
                        # image_output.render()
                        image_output = gr.Image(label="Output Image", elem_id="output_img", type='numpy',
                                                interactive=False, width=512)  # .render()
                    with gr.Accordion(open=True, label="Animated Video"):
                        # video_output.render()
                        video_output = gr.Video(label="Output Video", elem_id="output_vid", width=512)  # .render()

        with gr.Accordion("Control Panel (Using Sliders to Edit Image)", open=True):
            with gr.Tab("Head"):
                with gr.Row():
                    for k in labels_k[:3]:
                        slider = gr.Slider(minimum=-1.0, maximum=0.5, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[3:6]:
                        slider = gr.Slider(minimum=-0.5, maximum=0.5, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
            with gr.Tab("Mouth"):
                with gr.Row():
                    for k in labels_k[6:8]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[8:10]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
            with gr.Tab("Eyes"):
                with gr.Row():
                    for k in labels_k[10:12]:
                        slider = gr.Slider(minimum=-0.4, maximum=0.4, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)
                with gr.Row():
                    for k in labels_k[12:14]:
                        slider = gr.Slider(minimum=-0.2, maximum=0.2, value=0, label=k, elem_id="slider_" + str(k))
                        inputs_s.append(slider)

        for slider in inputs_s:
            slider.change(
                fn=edit_media,
                inputs=[image_input] + inputs_s,
                outputs=[image_output],
                show_progress='hidden',
                trigger_mode='always_last',
                # currently we have a latency of around 450ms
                stream_every=0.5,
            )

        # edit_btn.click(
        #     fn=edit_media,
        #     inputs=[image_input] + inputs_s,
        #     outputs=[image_output],
        #     show_progress=True
        # )

        animate_btn.click(
            fn=animate_media,
            inputs=[image_input, video_input] + inputs_s,
            outputs=[image_output, video_output],
            show_progress=True,
        )

        clear_btn.click(
            fn=clear_media,
            outputs=[image_output, video_output] + inputs_s,
        )

        gr.Examples(
            examples=[
                ['./data/source/macron.png', './data/driving/driving6.mp4',
                 -0.37, -0.34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                ['./data/source/taylor.png', './data/driving/driving6.mp4',
                 -0.31, -0.2, 0, -0.26, -0.14, 0, 0.068, 0.131, 0, 0, 0, 0, -0.058, 0.087],
                ['./data/source/macron.png', './data/driving/driving1.mp4',
                 0.14, 0, -0.26, -0.29, -0.11, 0, -0.13, -0.18, 0, 0, 0, 0, -0.02, 0.07],
                ['./data/source/portrait3.png', './data/driving/driving1.mp4',
                 -0.03, 0.21, -0.31, -0.12, -0.11, 0, -0.05, -0.16, 0, 0, 0, 0, -0.02, 0.07],
                ['./data/source/einstein.png', './data/driving/driving2.mp4',
                 -0.31, 0, 0, 0.16, 0.08, 0, -0.07, 0, 0.13, 0, 0, 0, 0, 0],
                ['./data/source/portrait1.png', './data/driving/driving4.mp4',
                 0, 0, -0.17, -0.19, 0.25, 0, 0, -0.086, 0.087, 0, 0, 0, 0, 0],
                ['./data/source/portrait2.png', './data/driving/driving8.mp4',
                 0, 0, -0.25, 0, 0, 0, 0, 0, 0, 0.126, 0, 0, 0, 0],
            ],
            fn=animate_media,
            inputs=[image_input, video_input] + inputs_s,
            outputs=[image_output, video_output],
        )
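# Hypothetical wiring sketch (assumption, not part of the original module): `animation`
# only builds a gr.Tab, so it has to be called inside an enclosing gr.Blocks() context
# with an already-loaded generator. `Generator`, "checkpoint.pt", and chunk_size=16
# below are placeholder names/values.
#
#     if __name__ == "__main__":
#         device = "cuda" if torch.cuda.is_available() else "cpu"
#         gen = Generator().to(device).eval()  # hypothetical model class
#         gen.load_state_dict(torch.load("checkpoint.pt", map_location=device))
#         with gr.Blocks() as demo:
#             animation(gen, chunk_size=16, device=device)
#         demo.launch()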