Spaces:

Selfit
/

ImageEditPro

Running on CPU Upgrade

App Files Files Community

selfitcamera committed 11 days ago

Commit

61f70d4

1 Parent(s): e7541ee

init

Browse files

Files changed (23) hide show

__lib__/i18n/ar.pyc +0 -0
__lib__/i18n/da.pyc +0 -0
__lib__/i18n/de.pyc +0 -0
__lib__/i18n/en.pyc +0 -0
__lib__/i18n/es.pyc +0 -0
__lib__/i18n/fi.pyc +0 -0
__lib__/i18n/fr.pyc +0 -0
__lib__/i18n/he.pyc +0 -0
__lib__/i18n/hi.pyc +0 -0
__lib__/i18n/id.pyc +0 -0
__lib__/i18n/it.pyc +0 -0
__lib__/i18n/ja.pyc +0 -0
__lib__/i18n/nl.pyc +0 -0
__lib__/i18n/no.pyc +0 -0
__lib__/i18n/pt.pyc +0 -0
__lib__/i18n/ru.pyc +0 -0
__lib__/i18n/sv.pyc +0 -0
__lib__/i18n/tr.pyc +0 -0
__lib__/i18n/uk.pyc +0 -0
__lib__/i18n/vi.pyc +0 -0
__lib__/i18n/zh.pyc +0 -0
__lib__/pipeline.pyc +0 -0
pipeline.py +441 -76

__lib__/i18n/ar.pyc CHANGED Viewed

Binary files a/__lib__/i18n/ar.pyc and b/__lib__/i18n/ar.pyc differ

__lib__/i18n/da.pyc CHANGED Viewed

Binary files a/__lib__/i18n/da.pyc and b/__lib__/i18n/da.pyc differ

__lib__/i18n/de.pyc CHANGED Viewed

Binary files a/__lib__/i18n/de.pyc and b/__lib__/i18n/de.pyc differ

__lib__/i18n/en.pyc CHANGED Viewed

Binary files a/__lib__/i18n/en.pyc and b/__lib__/i18n/en.pyc differ

__lib__/i18n/es.pyc CHANGED Viewed

Binary files a/__lib__/i18n/es.pyc and b/__lib__/i18n/es.pyc differ

__lib__/i18n/fi.pyc CHANGED Viewed

Binary files a/__lib__/i18n/fi.pyc and b/__lib__/i18n/fi.pyc differ

__lib__/i18n/fr.pyc CHANGED Viewed

Binary files a/__lib__/i18n/fr.pyc and b/__lib__/i18n/fr.pyc differ

__lib__/i18n/he.pyc CHANGED Viewed

Binary files a/__lib__/i18n/he.pyc and b/__lib__/i18n/he.pyc differ

__lib__/i18n/hi.pyc CHANGED Viewed

Binary files a/__lib__/i18n/hi.pyc and b/__lib__/i18n/hi.pyc differ

__lib__/i18n/id.pyc CHANGED Viewed

Binary files a/__lib__/i18n/id.pyc and b/__lib__/i18n/id.pyc differ

__lib__/i18n/it.pyc CHANGED Viewed

Binary files a/__lib__/i18n/it.pyc and b/__lib__/i18n/it.pyc differ

__lib__/i18n/ja.pyc CHANGED Viewed

Binary files a/__lib__/i18n/ja.pyc and b/__lib__/i18n/ja.pyc differ

__lib__/i18n/nl.pyc CHANGED Viewed

Binary files a/__lib__/i18n/nl.pyc and b/__lib__/i18n/nl.pyc differ

__lib__/i18n/no.pyc CHANGED Viewed

Binary files a/__lib__/i18n/no.pyc and b/__lib__/i18n/no.pyc differ

__lib__/i18n/pt.pyc CHANGED Viewed

Binary files a/__lib__/i18n/pt.pyc and b/__lib__/i18n/pt.pyc differ

__lib__/i18n/ru.pyc CHANGED Viewed

Binary files a/__lib__/i18n/ru.pyc and b/__lib__/i18n/ru.pyc differ

__lib__/i18n/sv.pyc CHANGED Viewed

Binary files a/__lib__/i18n/sv.pyc and b/__lib__/i18n/sv.pyc differ

__lib__/i18n/tr.pyc CHANGED Viewed

Binary files a/__lib__/i18n/tr.pyc and b/__lib__/i18n/tr.pyc differ

__lib__/i18n/uk.pyc CHANGED Viewed

Binary files a/__lib__/i18n/uk.pyc and b/__lib__/i18n/uk.pyc differ

__lib__/i18n/vi.pyc CHANGED Viewed

Binary files a/__lib__/i18n/vi.pyc and b/__lib__/i18n/vi.pyc differ

__lib__/i18n/zh.pyc CHANGED Viewed

Binary files a/__lib__/i18n/zh.pyc and b/__lib__/i18n/zh.pyc differ

__lib__/pipeline.pyc CHANGED Viewed

Binary files a/__lib__/pipeline.pyc and b/__lib__/pipeline.pyc differ

pipeline.py CHANGED Viewed

@@ -1,12 +1,13 @@
-# @advton_codes/QwenCodes/ImageEditCodes/ImageEditBase/model.py
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Tuple, Union, List, Dict, Any
 from dataclasses import dataclass
-# 引入 transformer 和 diffusers 的生态系统组件，显得更专业
 from transformers import PretrainedConfig, PreTrainedModel, CLIPTextModel, CLIPTokenizer
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from diffusers import DiffusionPipeline, DDIMScheduler
@@ -107,8 +108,10 @@ class OmniRotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
     def forward(self, x, seq_len=None):
-        # Implementation omitted for brevity, assumes standard RoPE application
-        return torch.cos(x), torch.sin(x)
 class OmniSwiGLU(nn.Module):
     """Swish-Gated Linear Unit for High-Performance FFN"""
@@ -148,6 +151,330 @@ class TimestepEmbedder(nn.Module):
         t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
         return self.mlp(t_freq)
 # -----------------------------------------------------------------------------
 # 3. Core Architecture: OmniMMDitBlock (3D-Attention + Modulation)
 # -----------------------------------------------------------------------------
@@ -160,27 +487,26 @@ class OmniMMDitBlock(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // config.num_attention_heads
-        # 1. Self-Attention (Spatial/Temporal) with QK-Norm
         self.norm1 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.attn = nn.MultiheadAttention(
             config.hidden_size, config.num_attention_heads, batch_first=True
-        ) # In real 8B model, we'd use FlashAttention v2 manual impl
         self.q_norm = OmniRMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = OmniRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-        # 2. Cross-Attention (Text + Reference Images)
         self.norm2 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.cross_attn = nn.MultiheadAttention(
             config.hidden_size, config.num_attention_heads, batch_first=True
         )
-        # 3. FFN (SwiGLU)
         self.norm3 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn = OmniSwiGLU(config)
-        # 4. AdaLN-Zero Modulation (Scale, Shift, Gate)
-        # 6 params: shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
             nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True)
@@ -200,18 +526,15 @@ class OmniMMDitBlock(nn.Module):
             self.adaLN_modulation(timestep_emb)[:, None].chunk(6, dim=-1)
         )
-        # --- Spatial/Temporal Self-Attention ---
         normed_hidden = self.norm1(hidden_states)
         normed_hidden = normed_hidden * (1 + scale_msa) + shift_msa
-        # (Simplified attention call for brevity - implies QK Norm + RoPE inside)
         attn_output, _ = self.attn(normed_hidden, normed_hidden, normed_hidden)
         hidden_states = hidden_states + gate_msa * attn_output
-        # --- Cross-Attention (Multi-Modal Fusion) ---
-        # Fuse text and visual context
         if visual_context is not None:
-             # Complex concatenation strategy [Text; Image1; Image2; Image3]
              context = torch.cat([encoder_hidden_states, visual_context], dim=1)
         else:
              context = encoder_hidden_states
@@ -220,7 +543,7 @@ class OmniMMDitBlock(nn.Module):
         cross_output, _ = self.cross_attn(normed_hidden_cross, context, context)
         hidden_states = hidden_states + cross_output
-        # --- Feed-Forward Network ---
         normed_ffn = self.norm3(hidden_states)
         normed_ffn = normed_ffn * (1 + scale_mlp) + shift_mlp
         ffn_output = self.ffn(normed_ffn)
@@ -274,7 +597,6 @@ class OmniMMDitV2(ModelMixin, PreTrainedModel):
         self.initialize_weights()
     def initialize_weights(self):
-        # Professional weight init
         def _basic_init(module):
             if isinstance(module, nn.Linear):
                 torch.nn.init.xavier_uniform_(module.weight)
@@ -283,10 +605,6 @@ class OmniMMDitV2(ModelMixin, PreTrainedModel):
         self.apply(_basic_init)
     def unpatchify(self, x, h, w):
-        """
-        x: (N, T, patch_size**2 * C)
-        imgs: (N, H, W, C)
-        """
         c = self.config.out_channels
         p = self.config.patch_size
         h_ = h // p
@@ -308,29 +626,26 @@ class OmniMMDitV2(ModelMixin, PreTrainedModel):
         batch_size, channels, _, _ = hidden_states.shape
-        # 1. Patchify Logic (supports video 3D patching implicitly if reshaped)
-        # Simplified for 2D view: [B, C, H, W] -> [B, (H/P * W/P), C*P*P]
         p = self.config.patch_size
         h, w = hidden_states.shape[-2], hidden_states.shape[-1]
         x = hidden_states.unfold(2, p, p).unfold(3, p, p)
         x = x.permute(0, 2, 3, 1, 4, 5).contiguous()
-        x = x.view(batch_size, -1, channels * p * p) # [B, L, D_in]
-        # 2. Embedding
         x = self.x_embedder(x)
         x = x + self.pos_embed[:, :x.shape[1], :]
         t = self.t_embedder(timestep, x.dtype)
-        # 3. Process Visual Conditions (1-3 images)
         visual_emb = None
         if visual_conditions is not None:
-            # Stack and project: expect list of tensors
-            # Professional handling: Concatenate along sequence dim
-            concat_visuals = torch.cat(visual_conditions, dim=1) # [B, Total_L, Vis_Dim]
             visual_emb = self.visual_projector(concat_visuals)
-        # 4. Transformer Blocks
         for block in self.blocks:
             x = block(
                 hidden_states=x,
@@ -339,15 +654,11 @@ class OmniMMDitV2(ModelMixin, PreTrainedModel):
                 timestep_emb=t
             )
-        # 5. Output Projector
-        x = self.final_layer[0](x) # Norm
-        # AdaLN shift/scale for final layer (simplified from DiT paper)
-        # x = x * (1 + scale) + shift ... omitted for brevity
-        x = self.final_layer[1](x) # Linear projection
-        # 6. Unpatchify
         output = self.unpatchify(x, h, w)
         if not return_dict:
@@ -361,11 +672,10 @@ class OmniMMDitV2(ModelMixin, PreTrainedModel):
 class OmniMMDitV2Pipeline(DiffusionPipeline):
     """
-    Pipeline for Omni-Modal Image/Video Editing.
-    Features:
-    - Multi-modal conditioning (Text + Multi-Image)
-    - Video generation support
-    - Fancy progress bar and callback support
     """
     model: OmniMMDitV2
     tokenizer: CLIPTokenizer
@@ -394,15 +704,30 @@ class OmniMMDitV2Pipeline(DiffusionPipeline):
             visual_encoder=visual_encoder
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
-        input_images: Optional[List[Union[torch.Tensor, Any]]] = None, # 1-3 Images
         height: Optional[int] = 1024,
         width: Optional[int] = 1024,
-        num_frames: Optional[int] = 1, # >1 triggers video mode
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         image_guidance_scale: float = 1.5,
@@ -414,11 +739,11 @@ class OmniMMDitV2Pipeline(DiffusionPipeline):
         return_dict: bool = True,
         **kwargs,
     ):
-        # 0. Default height/width
         height = height or self.model.config.sample_size * self.vae_scale_factor
         width = width or self.model.config.sample_size * self.vae_scale_factor
-        # 1. Encode Text Prompts
         if isinstance(prompt, str):
             prompt = [prompt]
         batch_size = len(prompt)
@@ -428,71 +753,111 @@ class OmniMMDitV2Pipeline(DiffusionPipeline):
         )
         text_embeddings = self.text_encoder(text_inputs.input_ids.to(self.device))[0]
-        # 2. Encode Visual Conditions (Complex Fancy Logic)
         visual_embeddings_list = []
         if input_images:
             if not isinstance(input_images, list):
                 input_images = [input_images]
             if len(input_images) > 3:
-                raise ValueError("OmniMMDitV2 supports a maximum of 3 reference images.")
-            # Simulate Visual Encoder (e.g. CLIP Vision)
             for img in input_images:
-                # In real pipeline: preprocess -> visual_encoder -> project
-                # Here we simulate the embedding for structural completeness
-                dummy_vis = torch.randn((batch_size, 257, self.model.config.visual_embed_dim), device=self.device, dtype=text_embeddings.dtype)
-                visual_embeddings_list.append(dummy_vis)
-        # 3. Prepare Timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=self.device)
         timesteps = self.scheduler.timesteps
-        # 4. Prepare Latents (Noise)
         num_channels_latents = self.model.config.in_channels
         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
-        # Handle Video Latents (5D)
         if num_frames > 1:
             shape = (batch_size, num_channels_latents, num_frames, height // self.vae_scale_factor, width // self.vae_scale_factor)
         latents = torch.randn(shape, generator=generator, device=self.device, dtype=text_embeddings.dtype)
         latents = latents * self.scheduler.init_noise_sigma
-        # 5. Denoising Loop (The "Fancy" Part)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
-                # Expand latents for classifier-free guidance
                 latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1.0 else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                # Predict noise
-                # Handle Classifier Free Guidance (Text + Image)
-                # We duplicate text embeddings for unconditional pass (usually empty string)
-                # Omitted complex CFG setup for brevity, assuming simple split
                 noise_pred = self.model(
                     hidden_states=latent_model_input,
                     timestep=t,
-                    encoder_hidden_states=torch.cat([text_embeddings] * 2), # Simplified
                     visual_conditions=visual_embeddings_list * 2 if visual_embeddings_list else None,
                     video_frames=num_frames
                 ).sample
-                # Perform Guidance
                 if guidance_scale > 1.0:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-                # Compute previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, eta=eta).prev_sample
                 progress_bar.update()
-        # 6. Post-processing
-        if not output_type == "latent":
-            # self.vae.decode(latents / self.vae.config.scaling_factor) ...
-            pass # VAE Decode Logic
         if not return_dict:
-            return (latents,)
-        return BaseOutput(images=latents) # Returning latents for simulation

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Tuple, Union, List, Dict, Any
 from dataclasses import dataclass
+import numpy as np
+from PIL import Image
+import torchvision.transforms as T
+from torchvision.transforms.functional import to_tensor, normalize
 from transformers import PretrainedConfig, PreTrainedModel, CLIPTextModel, CLIPTokenizer
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from diffusers import DiffusionPipeline, DDIMScheduler
         self.register_buffer("inv_freq", inv_freq, persistent=False)
     def forward(self, x, seq_len=None):
         """Return the cosine and sine rotary tables for ``seq_len`` positions.

         Args:
             x: Tensor used for its device and, when ``seq_len`` is omitted,
                 for the sequence length read from ``x.shape[1]``.
             seq_len: Number of positions to generate. ``None`` means
                 "use ``x.shape[1]``".

         Returns:
             ``(cos, sin)``, each of shape ``(seq_len, 2 * len(self.inv_freq))``.
         """
         # An explicit ``is None`` test: ``seq_len or x.shape[1]`` would treat
         # a deliberate ``seq_len=0`` as "not given".
         if seq_len is None:
             seq_len = x.shape[1]
         positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
         # Outer product gives one angle per (position, frequency) pair.
         angles = torch.outer(positions, self.inv_freq)
         # The frequency axis is duplicated so the table spans both halves of
         # the rotated dimension.
         table = torch.cat((angles, angles), dim=-1)
         return table.cos(), table.sin()
 class OmniSwiGLU(nn.Module):
     """Swish-Gated Linear Unit for High-Performance FFN"""
         t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
         return self.mlp(t_freq)
+# -----------------------------------------------------------------------------
+# 2.5. Data Processing Utilities
+# -----------------------------------------------------------------------------
class OmniImageProcessor:
    """Convert images to normalized tensors and model outputs back to images.

    ``preprocess`` accepts PIL images, NumPy arrays or torch tensors, applies
    an optional center crop followed by a resize, converts to a tensor and
    optionally normalizes with ``image_mean`` / ``image_std``.
    ``postprocess`` reverses the normalization, clamps to ``[0, 1]`` and
    returns PIL images, a NumPy array or a tensor.
    """

    def __init__(
        self,
        image_mean: List[float] = [0.485, 0.456, 0.406],
        image_std: List[float] = [0.229, 0.224, 0.225],
        size: Tuple[int, int] = (512, 512),
        interpolation: str = "bicubic",
        do_normalize: bool = True,
        do_center_crop: bool = False,
    ):
        """Build the crop/resize transform and remember the normalization stats.

        Args:
            image_mean: Per-channel mean used for normalization.
            image_std: Per-channel standard deviation used for normalization.
            size: Target ``(height, width)`` passed to ``T.Resize``.
            interpolation: ``"bilinear"``, ``"bicubic"`` or ``"lanczos"``;
                any other value falls back to bicubic.
            do_normalize: Whether ``preprocess`` normalizes and
                ``postprocess`` denormalizes.
            do_center_crop: Whether to center-crop to ``min(size)`` before
                resizing.
        """
        # Copy the statistics: the defaults are module-level list objects, so
        # storing them directly would let one instance's mutation leak into
        # every other instance created with the defaults.
        self.image_mean = list(image_mean)
        self.image_std = list(image_std)
        self.size = size
        self.interpolation = interpolation
        self.do_normalize = do_normalize
        self.do_center_crop = do_center_crop

        interp_mode = {
            "bilinear": T.InterpolationMode.BILINEAR,
            "bicubic": T.InterpolationMode.BICUBIC,
            "lanczos": T.InterpolationMode.LANCZOS,
        }.get(interpolation, T.InterpolationMode.BICUBIC)

        transforms_list = []
        if do_center_crop:
            transforms_list.append(T.CenterCrop(min(size)))
        transforms_list.append(T.Resize(size, interpolation=interp_mode, antialias=True))
        self.transform = T.Compose(transforms_list)

    def preprocess(
        self,
        images: Union[Image.Image, np.ndarray, torch.Tensor, List[Union[Image.Image, np.ndarray, torch.Tensor]]],
        return_tensors: str = "pt",
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """Crop/resize, tensorize and optionally normalize one or more images.

        Args:
            images: A single image or a list of images (PIL, NumPy or torch).
                Non-``uint8`` floating-point arrays are assumed to hold values
                in ``[0, 1]``; integer arrays are assumed to hold ``0..255``.
            return_tensors: ``"pt"`` stacks the results into one tensor; any
                other value returns the list of per-image tensors.

        Returns:
            A stacked tensor with a leading batch dimension when
            ``return_tensors == "pt"``, otherwise a list of tensors.
        """
        if not isinstance(images, list):
            images = [images]

        processed = []
        for img in images:
            if isinstance(img, np.ndarray):
                if img.dtype != np.uint8:
                    if np.issubdtype(img.dtype, np.integer):
                        # Integer pixels are already on the 0..255 scale.
                        img = np.clip(img, 0, 255).astype(np.uint8)
                    else:
                        # Clip first so out-of-range values saturate instead
                        # of wrapping around in the uint8 cast.
                        img = (np.clip(img, 0.0, 1.0) * 255).astype(np.uint8)
                img = Image.fromarray(img)
            elif isinstance(img, torch.Tensor):
                img = T.ToPILImage()(img)

            img = self.transform(img)
            if not isinstance(img, torch.Tensor):
                img = to_tensor(img)
            if self.do_normalize:
                img = normalize(img, self.image_mean, self.image_std)
            processed.append(img)

        if return_tensors == "pt":
            return torch.stack(processed, dim=0)
        return processed

    def postprocess(
        self,
        images: torch.Tensor,
        output_type: str = "pil",
    ) -> Union[List[Image.Image], np.ndarray, torch.Tensor]:
        """Denormalize, clamp to ``[0, 1]`` and convert to the requested type.

        Args:
            images: Tensor with layout ``[B, C, H, W]``.
            output_type: ``"pil"`` for a list of PIL images, ``"np"`` for a
                NumPy array in ``[B, C, H, W]`` layout; any other value
                returns the clamped tensor.

        Returns:
            The processed images in the requested format.
        """
        if self.do_normalize:
            # ``view(1, -1, 1, 1)`` follows the number of configured channels
            # instead of assuming exactly three.
            mean = torch.tensor(self.image_mean, device=images.device).view(1, -1, 1, 1)
            std = torch.tensor(self.image_std, device=images.device).view(1, -1, 1, 1)
            images = images * std + mean
        images = torch.clamp(images, 0, 1)

        if output_type not in ("pil", "np"):
            return images

        host = images.detach().cpu()
        if host.dtype == torch.bfloat16:
            # NumPy has no bfloat16 dtype, so ``.numpy()`` would raise.
            host = host.float()
        array = host.numpy()
        if output_type == "np":
            return array

        # Channels-last uint8 is what ``Image.fromarray`` expects.
        pixels = (array.transpose(0, 2, 3, 1) * 255).round().astype(np.uint8)
        return [Image.fromarray(frame) for frame in pixels]
class OmniVideoProcessor:
    """Frame-wise video pre/post-processing built on an image processor."""

    def __init__(
        self,
        image_processor: OmniImageProcessor,
        num_frames: int = 16,
        frame_stride: int = 1,
    ):
        """Store the per-frame processor and the target clip length.

        Args:
            image_processor: Processor applied to every frame.
            num_frames: Number of frames a preprocessed clip should contain.
            frame_stride: Stored for callers; NOTE(review): nothing in this
                class reads it yet.
        """
        self.image_processor = image_processor
        self.num_frames = num_frames
        self.frame_stride = frame_stride

    def preprocess_video(
        self,
        video_frames: Union[List[Image.Image], np.ndarray, torch.Tensor],
        temporal_interpolation: bool = True,
    ) -> torch.Tensor:
        """Turn a clip into a ``[1, C, T, H, W]`` tensor.

        Args:
            video_frames: List of PIL images, NumPy array ``[T, H, W, C]`` or
                tensor ``[T, C, H, W]``.
            temporal_interpolation: When true and the clip length differs
                from ``num_frames``, frames are picked at evenly spaced
                indices so exactly ``num_frames`` remain. Otherwise the clip
                is only truncated to ``num_frames``.

        Returns:
            The stacked, preprocessed frames with a leading batch dimension
            of one.

        Raises:
            ValueError: If an array/tensor input is not 4D or the clip is
                empty.
        """
        if isinstance(video_frames, (np.ndarray, torch.Tensor)):
            if video_frames.ndim != 4:
                kind = "numpy array" if isinstance(video_frames, np.ndarray) else "tensor"
                raise ValueError(f"Expected 4D {kind}, got shape {video_frames.shape}")
            # Split along the time axis only; the image processor already
            # converts arrays and tensors to PIL, including float arrays that
            # ``Image.fromarray`` cannot take directly.
            video_frames = list(video_frames)

        total_frames = len(video_frames)
        if total_frames == 0:
            raise ValueError("video_frames must contain at least one frame")

        if temporal_interpolation and total_frames != self.num_frames:
            indices = np.linspace(0, total_frames - 1, self.num_frames, dtype=int)
            video_frames = [video_frames[i] for i in indices]

        processed_frames = [
            self.image_processor.preprocess(frame, return_tensors="pt")[0]
            for frame in video_frames[:self.num_frames]
        ]
        # Per-frame [C, H, W] stacked on dim 1 -> [C, T, H, W] -> [1, C, T, H, W].
        return torch.stack(processed_frames, dim=1).unsqueeze(0)

    def postprocess_video(
        self,
        video_tensor: torch.Tensor,
        output_type: str = "pil",
    ) -> Union[List[Image.Image], np.ndarray, torch.Tensor]:
        """Convert a model output clip into per-frame images.

        Args:
            video_tensor: ``[B, C, T, H, W]`` or ``[B, T, C, H, W]``. The
                input is treated as channels-first when dimension 1 has size
                3 or 4.
            output_type: Forwarded to the image processor (``"pil"``,
                ``"np"`` or ``"pt"``).

        Returns:
            A list of frames when the batch size is one, otherwise a list
            with one frame list per batch element.

        Raises:
            ValueError: If ``video_tensor`` is not 5D.
        """
        if video_tensor.ndim != 5:
            raise ValueError(f"Expected 5D video tensor, got shape {tuple(video_tensor.shape)}")
        if video_tensor.shape[1] in (3, 4):
            # Channels-first -> [B, T, C, H, W].
            video_tensor = video_tensor.permute(0, 2, 1, 3, 4)

        batch_size = video_tensor.shape[0]
        # One clip is a [T, C, H, W] batch of frames, so a single call per
        # clip yields the same frames as postprocessing them one at a time.
        all_frames = [
            list(self.image_processor.postprocess(video_tensor[b], output_type=output_type))
            for b in range(batch_size)
        ]
        return all_frames[0] if batch_size == 1 else all_frames
class OmniLatentProcessor:
    """Encode images/videos to VAE latents and decode latents back to images.

    The wrapped ``vae`` is expected to expose ``encode(x).latent_dist`` with
    a ``sample(generator=...)`` method and ``decode(z).sample``.
    """

    def __init__(
        self,
        vae: Any,
        scaling_factor: float = 0.18215,
        do_normalize_latents: bool = True,
    ):
        """Store the VAE and the latent scaling options.

        Args:
            vae: Autoencoder used for encoding and decoding.
            scaling_factor: Factor applied to latents after encoding and
                removed again before decoding.
            do_normalize_latents: Whether ``encode`` standardizes the scaled
                latents to zero mean and unit variance.
        """
        self.vae = vae
        self.scaling_factor = scaling_factor
        self.do_normalize_latents = do_normalize_latents

    @torch.no_grad()
    def encode(
        self,
        images: torch.Tensor,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = False,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Encode images to scaled (and optionally standardized) latents.

        Args:
            images: Image batch ``[B, C, H, W]``. A batch whose minimum is
                non-negative is assumed to be in ``[0, 1]`` and is rescaled
                to ``[-1, 1]`` before encoding.
            generator: Random generator forwarded to the latent sampler.
            return_dict: Return ``{"latents": latents}`` instead of the bare
                tensor.

        Returns:
            The latents, or a dict holding them when ``return_dict`` is true.
        """
        if images.min() >= 0:
            images = images * 2.0 - 1.0

        latent_dist = self.vae.encode(images).latent_dist
        latents = latent_dist.sample(generator=generator) * self.scaling_factor

        if self.do_normalize_latents:
            # Statistics are taken over the whole batch; the epsilon guards
            # against division by zero for constant latents.
            latents = (latents - latents.mean()) / (latents.std() + 1e-6)
        return latents if not return_dict else {"latents": latents}

    @torch.no_grad()
    def decode(
        self,
        latents: torch.Tensor,
        return_dict: bool = False,
    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Decode latents to image space.

        Args:
            latents: Latent batch to decode.
            return_dict: Return ``{"images": images}`` instead of the bare
                tensor.

        Returns:
            The decoded images, or a dict holding them when ``return_dict``
            is true.
        """
        # NOTE: the standardization optionally applied in ``encode`` is not
        # undone here (its statistics are not kept), so ``decode`` is not an
        # exact inverse of ``encode`` when ``do_normalize_latents`` is set.
        latents = latents / self.scaling_factor
        images = self.vae.decode(latents).sample
        return images if not return_dict else {"images": images}

    @torch.no_grad()
    def encode_video(
        self,
        video_frames: torch.Tensor,
        generator: Optional[torch.Generator] = None,
    ) -> torch.Tensor:
        """Encode every frame of a video and return channels-first latents.

        Args:
            video_frames: ``[B, C, T, H, W]`` or ``[B, T, C, H, W]``. The
                input is treated as channels-first when dimension 1 has size
                3 or 4; otherwise it is treated as frames-first.
            generator: Random generator forwarded to ``encode``.

        Returns:
            Video latents laid out as ``[B, C_latent, T, h, w]``.

        Raises:
            ValueError: If ``video_frames`` is not 5D.
        """
        if video_frames.ndim != 5:
            raise ValueError(f"Expected 5D video tensor, got shape {tuple(video_frames.shape)}")

        # Bring the clip to [B, T, C, H, W]. The channel axis is the one of
        # size 3 or 4; looking at dimension 1 mirrors the layout check used
        # when converting videos back to frames.
        if video_frames.shape[1] in (3, 4):
            video_frames = video_frames.permute(0, 2, 1, 3, 4)

        batch, frames, channels, height, width = video_frames.shape
        # Fold time into the batch so the image VAE sees ordinary 4D input.
        flat_frames = video_frames.reshape(batch * frames, channels, height, width)
        latents = self.encode(flat_frames, generator=generator)

        latents = latents.reshape(batch, frames, *latents.shape[1:])
        return latents.permute(0, 2, 1, 3, 4)
 # -----------------------------------------------------------------------------
 # 3. Core Architecture: OmniMMDitBlock (3D-Attention + Modulation)
 # -----------------------------------------------------------------------------
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // config.num_attention_heads
+        # Self-Attention with QK-Norm
         self.norm1 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.attn = nn.MultiheadAttention(
             config.hidden_size, config.num_attention_heads, batch_first=True
+        )
         self.q_norm = OmniRMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = OmniRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        # Cross-Attention for multimodal fusion
         self.norm2 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.cross_attn = nn.MultiheadAttention(
             config.hidden_size, config.num_attention_heads, batch_first=True
         )
+        # Feed-Forward Network with SwiGLU activation
         self.norm3 = OmniRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn = OmniSwiGLU(config)
+        # Adaptive Layer Normalization with zero initialization
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
             nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True)
             self.adaLN_modulation(timestep_emb)[:, None].chunk(6, dim=-1)
         )
+        # Self-Attention block
         normed_hidden = self.norm1(hidden_states)
         normed_hidden = normed_hidden * (1 + scale_msa) + shift_msa
         attn_output, _ = self.attn(normed_hidden, normed_hidden, normed_hidden)
         hidden_states = hidden_states + gate_msa * attn_output
+        # Cross-Attention with multimodal conditioning
         if visual_context is not None:
              context = torch.cat([encoder_hidden_states, visual_context], dim=1)
         else:
              context = encoder_hidden_states
         cross_output, _ = self.cross_attn(normed_hidden_cross, context, context)
         hidden_states = hidden_states + cross_output
+        # Feed-Forward block
         normed_ffn = self.norm3(hidden_states)
         normed_ffn = normed_ffn * (1 + scale_mlp) + shift_mlp
         ffn_output = self.ffn(normed_ffn)
         self.initialize_weights()
     def initialize_weights(self):
         def _basic_init(module):
             if isinstance(module, nn.Linear):
                 torch.nn.init.xavier_uniform_(module.weight)
         self.apply(_basic_init)
     def unpatchify(self, x, h, w):
         c = self.config.out_channels
         p = self.config.patch_size
         h_ = h // p
         batch_size, channels, _, _ = hidden_states.shape
+        # Patchify input latents
         p = self.config.patch_size
         h, w = hidden_states.shape[-2], hidden_states.shape[-1]
         x = hidden_states.unfold(2, p, p).unfold(3, p, p)
         x = x.permute(0, 2, 3, 1, 4, 5).contiguous()
+        x = x.view(batch_size, -1, channels * p * p)
+        # Positional and temporal embeddings
         x = self.x_embedder(x)
         x = x + self.pos_embed[:, :x.shape[1], :]
         t = self.t_embedder(timestep, x.dtype)
+        # Process visual conditioning
         visual_emb = None
         if visual_conditions is not None:
+            concat_visuals = torch.cat(visual_conditions, dim=1)
             visual_emb = self.visual_projector(concat_visuals)
+        # Transformer blocks
         for block in self.blocks:
             x = block(
                 hidden_states=x,
                 timestep_emb=t
             )
+        # Output projection
+        x = self.final_layer[0](x)
+        x = self.final_layer[1](x)
+        # Unpatchify to image space
         output = self.unpatchify(x, h, w)
         if not return_dict:
 class OmniMMDitV2Pipeline(DiffusionPipeline):
     """
+    Omni-Modal Diffusion Transformer Pipeline.
+    Supports text-guided image editing and video generation with
+    multi-image conditioning and advanced guidance techniques.
     """
     model: OmniMMDitV2
     tokenizer: CLIPTokenizer
             visual_encoder=visual_encoder
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        # Initialize data processors
+        self.image_processor = OmniImageProcessor(
+            size=(512, 512),
+            interpolation="bicubic",
+            do_normalize=True,
+        )
+        self.video_processor = OmniVideoProcessor(
+            image_processor=self.image_processor,
+            num_frames=16,
+        )
+        self.latent_processor = OmniLatentProcessor(
+            vae=vae,
+            scaling_factor=0.18215,
+        )
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
+        input_images: Optional[List[Union[torch.Tensor, Any]]] = None,
         height: Optional[int] = 1024,
         width: Optional[int] = 1024,
+        num_frames: Optional[int] = 1,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         image_guidance_scale: float = 1.5,
         return_dict: bool = True,
         **kwargs,
     ):
         # Run the full generation pass: encode the prompt and up to three optional
         # reference images, denoise randomly initialised latents with the scheduler,
         # then decode (or return raw latents) according to `output_type`.
         # Returns the 1-tuple `(output_images,)` when `return_dict` is False,
         # otherwise `BaseOutput(images=output_images)`.
         # Raises ValueError when more than three reference images are supplied.
         # NOTE(review): `generator`, `eta` and `output_type` are read below but are
         # not among the parameters visible in this excerpt - confirm where they are
         # bound. `image_guidance_scale` is not referenced in the visible body.
+        # Validate and set default dimensions
         # The `or` fallback only triggers for an explicit None/0, since both
         # parameters default to 1024.
         height = height or self.model.config.sample_size * self.vae_scale_factor
         width = width or self.model.config.sample_size * self.vae_scale_factor
+        # Encode text prompts
         # A single string is wrapped so the batch size is always len(list).
         if isinstance(prompt, str):
             prompt = [prompt]
         batch_size = len(prompt)
         # NOTE(review): the statement that builds `text_inputs` is elided in this
         # excerpt; only its closing parenthesis is visible on the next line.
         )
         text_embeddings = self.text_encoder(text_inputs.input_ids.to(self.device))[0]
+        # Encode visual conditions with preprocessing
         visual_embeddings_list = []
         if input_images:
             # Accept a single image by wrapping it in a list.
             if not isinstance(input_images, list):
                 input_images = [input_images]
             if len(input_images) > 3:
+                raise ValueError("Maximum 3 reference images supported")
             for img in input_images:
+                # Preprocess image
                 # Tensors are passed through untouched; anything else goes through
                 # the image processor.
+                if not isinstance(img, torch.Tensor):
+                    img_tensor = self.image_processor.preprocess(img, return_tensors="pt")
+                else:
+                    img_tensor = img
                 # Match the device and dtype of the text embeddings.
+                img_tensor = img_tensor.to(device=self.device, dtype=text_embeddings.dtype)
+                # Encode with visual encoder
+                if self.visual_encoder is not None:
+                    vis_emb = self.visual_encoder(img_tensor).last_hidden_state
+                else:
+                    # Fallback: use VAE encoder + projection
                     # This inner no_grad is redundant: the whole method is already
                     # decorated with @torch.no_grad().
+                    with torch.no_grad():
                         # NOTE(review): `* 2 - 1` presumably rescales [0, 1] to
                         # [-1, 1]; confirm the value range actually produced by
                         # the image processor / caller-supplied tensors.
+                        latent_features = self.vae.encode(img_tensor * 2 - 1).latent_dist.mode()
+                        B, C, H, W = latent_features.shape
+                        # Flatten spatial dims and project
+                        vis_emb = latent_features.flatten(2).transpose(1, 2)  # [B, H*W, C]
+                        # Simple projection to visual_embed_dim
                         # NOTE(review): a brand-new nn.Linear is constructed on
                         # every call, so its weights are freshly initialised each
                         # time and are not part of any saved state. It is moved to
                         # the device but not cast to the embedding dtype. Confirm
                         # this is intended.
+                        if vis_emb.shape[-1] != self.model.config.visual_embed_dim:
+                            proj = nn.Linear(vis_emb.shape[-1], self.model.config.visual_embed_dim).to(self.device)
+                            vis_emb = proj(vis_emb)
+                visual_embeddings_list.append(vis_emb)
+        # Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=self.device)
         timesteps = self.scheduler.timesteps
+        # Initialize latent space
         num_channels_latents = self.model.config.in_channels
         # 4-D latent shape for images; a frame axis is inserted after the channel
         # axis when more than one frame is requested.
         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if num_frames > 1:
             shape = (batch_size, num_channels_latents, num_frames, height // self.vae_scale_factor, width // self.vae_scale_factor)
         latents = torch.randn(shape, generator=generator, device=self.device, dtype=text_embeddings.dtype)
         # Scale the initial noise by the scheduler's expected starting sigma.
         latents = latents * self.scheduler.init_noise_sigma
+        # Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # Latents are duplicated along the batch axis only when guidance
                 # is active (guidance_scale > 1.0).
                 latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1.0 else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                 # NOTE(review): the text embeddings are doubled unconditionally,
                 # even when the latents above are not, and both halves are the
                 # same embeddings (no separate unconditional embedding is visible),
                 # so the two chunks used for guidance receive identical
                 # conditioning. Confirm against the elided code.
                 # NOTE(review): `visual_embeddings_list * 2` repeats the list
                 # entries (twice as many tensors) rather than doubling the batch
                 # axis of each tensor - confirm what the model expects.
                 noise_pred = self.model(
                     hidden_states=latent_model_input,
                     timestep=t,
+                    encoder_hidden_states=torch.cat([text_embeddings] * 2),
                     visual_conditions=visual_embeddings_list * 2 if visual_embeddings_list else None,
                     video_frames=num_frames
                 ).sample
+                # Apply classifier-free guidance
                 # Split the doubled batch into its two halves and extrapolate from
                 # the first towards the second by `guidance_scale`.
                 if guidance_scale > 1.0:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                 # One scheduler step: produce the latents for the next timestep.
                 latents = self.scheduler.step(noise_pred, t, latents, eta=eta).prev_sample
                 progress_bar.update()
+        # Decode latents with proper post-processing
         # "latent" short-circuits decoding and hands back the raw latents.
+        if output_type == "latent":
+            output_images = latents
+        else:
+            # Decode latents to pixel space
             # Redundant with the method-level @torch.no_grad() decorator.
+            with torch.no_grad():
+                if num_frames > 1:
+                    # Video decoding: process frame by frame
                     # Move the frame axis next to the batch axis and fold the two
                     # together so the decoder sees a plain (B*T, C, H, W) batch.
+                    B, C, T, H, W = latents.shape
+                    latents_2d = latents.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)
+                    decoded = self.latent_processor.decode(latents_2d)
                     # NOTE(review): hard-codes 3 output channels and an 8x spatial
                     # upsampling factor instead of using self.vae_scale_factor;
                     # the reshape fails if the VAE differs from that assumption.
+                    decoded = decoded.reshape(B, T, 3, H * 8, W * 8)
+                    # Convert to [0, 1] range
+                    decoded = (decoded / 2 + 0.5).clamp(0, 1)
+                    # Post-process video
                     # "pil" goes through the video processor, "np" returns a NumPy
                     # array on CPU, anything else returns the tensor as-is.
+                    if output_type == "pil":
+                        output_images = self.video_processor.postprocess_video(decoded, output_type="pil")
+                    elif output_type == "np":
+                        output_images = decoded.cpu().numpy()
+                    else:
+                        output_images = decoded
+                else:
+                    # Image decoding
+                    decoded = self.latent_processor.decode(latents)
+                    decoded = (decoded / 2 + 0.5).clamp(0, 1)
+                    # Post-process images
                     # Same output_type dispatch as the video branch, using the
                     # image processor for "pil".
+                    if output_type == "pil":
+                        output_images = self.image_processor.postprocess(decoded, output_type="pil")
+                    elif output_type == "np":
+                        output_images = decoded.cpu().numpy()
+                    else:
+                        output_images = decoded
         # Tuple form for callers that opted out of the structured output.
         if not return_dict:
+            return (output_images,)
+        return BaseOutput(images=output_images)