selfitcamera committed · Commit 397c271 · 1 Parent(s): 8f954d0
init
Browse files
- __lib__/i18n/ar.pyc +0 -0
- __lib__/i18n/da.pyc +0 -0
- __lib__/i18n/de.pyc +0 -0
- __lib__/i18n/en.pyc +0 -0
- __lib__/i18n/es.pyc +0 -0
- __lib__/i18n/fi.pyc +0 -0
- __lib__/i18n/fr.pyc +0 -0
- __lib__/i18n/he.pyc +0 -0
- __lib__/i18n/hi.pyc +0 -0
- __lib__/i18n/id.pyc +0 -0
- __lib__/i18n/it.pyc +0 -0
- __lib__/i18n/ja.pyc +0 -0
- __lib__/i18n/nl.pyc +0 -0
- __lib__/i18n/no.pyc +0 -0
- __lib__/i18n/pt.pyc +0 -0
- __lib__/i18n/ru.pyc +0 -0
- __lib__/i18n/sv.pyc +0 -0
- __lib__/i18n/tr.pyc +0 -0
- __lib__/i18n/uk.pyc +0 -0
- __lib__/i18n/vi.pyc +0 -0
- __lib__/i18n/zh.pyc +0 -0
- __lib__/pipeline.pyc +0 -0
- pipeline.py +874 -0
__lib__/i18n/ar.pyc CHANGED: Binary files a/__lib__/i18n/ar.pyc and b/__lib__/i18n/ar.pyc differ
__lib__/i18n/da.pyc CHANGED: Binary files a/__lib__/i18n/da.pyc and b/__lib__/i18n/da.pyc differ
__lib__/i18n/de.pyc CHANGED: Binary files a/__lib__/i18n/de.pyc and b/__lib__/i18n/de.pyc differ
__lib__/i18n/en.pyc CHANGED: Binary files a/__lib__/i18n/en.pyc and b/__lib__/i18n/en.pyc differ
__lib__/i18n/es.pyc CHANGED: Binary files a/__lib__/i18n/es.pyc and b/__lib__/i18n/es.pyc differ
__lib__/i18n/fi.pyc CHANGED: Binary files a/__lib__/i18n/fi.pyc and b/__lib__/i18n/fi.pyc differ
__lib__/i18n/fr.pyc CHANGED: Binary files a/__lib__/i18n/fr.pyc and b/__lib__/i18n/fr.pyc differ
__lib__/i18n/he.pyc CHANGED: Binary files a/__lib__/i18n/he.pyc and b/__lib__/i18n/he.pyc differ
__lib__/i18n/hi.pyc CHANGED: Binary files a/__lib__/i18n/hi.pyc and b/__lib__/i18n/hi.pyc differ
__lib__/i18n/id.pyc CHANGED: Binary files a/__lib__/i18n/id.pyc and b/__lib__/i18n/id.pyc differ
__lib__/i18n/it.pyc CHANGED: Binary files a/__lib__/i18n/it.pyc and b/__lib__/i18n/it.pyc differ
__lib__/i18n/ja.pyc CHANGED: Binary files a/__lib__/i18n/ja.pyc and b/__lib__/i18n/ja.pyc differ
__lib__/i18n/nl.pyc CHANGED: Binary files a/__lib__/i18n/nl.pyc and b/__lib__/i18n/nl.pyc differ
__lib__/i18n/no.pyc CHANGED: Binary files a/__lib__/i18n/no.pyc and b/__lib__/i18n/no.pyc differ
__lib__/i18n/pt.pyc CHANGED: Binary files a/__lib__/i18n/pt.pyc and b/__lib__/i18n/pt.pyc differ
__lib__/i18n/ru.pyc CHANGED: Binary files a/__lib__/i18n/ru.pyc and b/__lib__/i18n/ru.pyc differ
__lib__/i18n/sv.pyc CHANGED: Binary files a/__lib__/i18n/sv.pyc and b/__lib__/i18n/sv.pyc differ
__lib__/i18n/tr.pyc CHANGED: Binary files a/__lib__/i18n/tr.pyc and b/__lib__/i18n/tr.pyc differ
__lib__/i18n/uk.pyc CHANGED: Binary files a/__lib__/i18n/uk.pyc and b/__lib__/i18n/uk.pyc differ
__lib__/i18n/vi.pyc CHANGED: Binary files a/__lib__/i18n/vi.pyc and b/__lib__/i18n/vi.pyc differ
__lib__/i18n/zh.pyc CHANGED: Binary files a/__lib__/i18n/zh.pyc and b/__lib__/i18n/zh.pyc differ
__lib__/pipeline.pyc CHANGED: Binary files a/__lib__/pipeline.pyc and b/__lib__/pipeline.pyc differ

pipeline.py CHANGED
@@ -1058,3 +1058,877 @@ class OmniMMDitV2Pipeline(DiffusionPipeline):
        if not return_dict:
            return (output_images,)

        return BaseOutput(images=output_images)

# -----------------------------------------------------------------------------
# 6. Advanced Multi-Modal Window Attention Block (Audio + Video + Image)
# -----------------------------------------------------------------------------

@dataclass
class MultiModalInput:
    """Container for multi-modal inputs"""
    image_embeds: Optional[torch.Tensor] = None      # [B, L_img, D]
    video_embeds: Optional[torch.Tensor] = None      # [B, T_video, L_vid, D]
    audio_embeds: Optional[torch.Tensor] = None      # [B, T_audio, L_aud, D]
    attention_mask: Optional[torch.Tensor] = None    # [B, total_length]

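A minimal sketch of how the MultiModalInput container above could be populated for a dry run (editor's illustration; the shapes are arbitrary example values and the snippet assumes the torch import already present in pipeline.py):

def _example_multimodal_input() -> MultiModalInput:
    # Hypothetical sizes: batch of 2, 1024-dim embeddings.
    return MultiModalInput(
        image_embeds=torch.randn(2, 256, 1024),      # [B, L_img, D]
        video_embeds=torch.randn(2, 16, 196, 1024),  # [B, T_video, L_vid, D]
        audio_embeds=torch.randn(2, 32, 64, 1024),   # [B, T_audio, L_aud, D]
        attention_mask=None,
    )
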
class TemporalWindowPartition(nn.Module):
    """
    Partition temporal sequences into windows for efficient attention.
    Supports both uniform and adaptive windowing strategies.
    """
    def __init__(
        self,
        window_size: int = 8,
        shift_size: int = 0,
        use_adaptive_window: bool = False,
    ):
        super().__init__()
        self.window_size = window_size
        self.shift_size = shift_size
        self.use_adaptive_window = use_adaptive_window

    def partition(self, x: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, Any]]:
        """
        Partition sequence into windows.

        Args:
            x: Input tensor [B, T, L, D] or [B, L, D]

        Returns:
            windowed: [B * num_windows, window_size, L, D]
            info: Dictionary with partition information
        """
        if x.ndim == 3:  # Static input (image)
            return x, {"is_temporal": False, "original_shape": x.shape}

        B, T, L, D = x.shape

        # Apply temporal shift for shifted window attention (Swin-Transformer style)
        if self.shift_size > 0:
            x = torch.roll(x, shifts=-self.shift_size, dims=1)

        # Pad if necessary
        pad_t = (self.window_size - T % self.window_size) % self.window_size
        if pad_t > 0:
            x = F.pad(x, (0, 0, 0, 0, 0, pad_t))

        T_padded = T + pad_t
        num_windows = T_padded // self.window_size

        # Reshape into windows: [B, num_windows, window_size, L, D]
        x_windowed = x.view(B, num_windows, self.window_size, L, D)

        # Merge batch and window dims: [B * num_windows, window_size, L, D]
        x_windowed = x_windowed.view(B * num_windows, self.window_size, L, D)

        info = {
            "is_temporal": True,
            "original_shape": (B, T, L, D),
            "num_windows": num_windows,
            "pad_t": pad_t,
        }

        return x_windowed, info

    def merge(self, x_windowed: torch.Tensor, info: Dict[str, Any]) -> torch.Tensor:
        """
        Merge windows back to original sequence.

        Args:
            x_windowed: Windowed tensor [B * num_windows, window_size, L, D]
            info: Partition information from partition()

        Returns:
            x: Merged tensor [B, T, L, D] or [B, L, D]
        """
        if not info["is_temporal"]:
            return x_windowed

        B, T, L, D = info["original_shape"]
        num_windows = info["num_windows"]
        pad_t = info["pad_t"]

        # Reshape: [B * num_windows, window_size, L, D] -> [B, num_windows, window_size, L, D]
        x = x_windowed.view(B, num_windows, self.window_size, L, D)

        # Merge windows: [B, T_padded, L, D]
        x = x.view(B, num_windows * self.window_size, L, D)

        # Remove padding
        if pad_t > 0:
            x = x[:, :-pad_t, :, :]

        # Reverse temporal shift
        if self.shift_size > 0:
            x = torch.roll(x, shifts=self.shift_size, dims=1)

        return x

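A short usage sketch for the partition/merge round trip defined above (editor's illustration with arbitrary shapes; 19 frames are padded to 24 and split into three windows of 8):

def _example_window_partition_roundtrip() -> None:
    part = TemporalWindowPartition(window_size=8, shift_size=0)
    x = torch.randn(2, 19, 196, 64)          # [B, T, L, D]
    windows, info = part.partition(x)        # [2 * 3, 8, 196, 64] after padding
    restored = part.merge(windows, info)     # padding (and any shift) undone
    assert restored.shape == x.shape
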
class WindowCrossAttention(nn.Module):
    """
    Window-based Cross Attention with support for temporal sequences.
    Performs attention within local windows for computational efficiency.
    """
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        window_size: int = 8,
        qkv_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        use_relative_position_bias: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5

        # Query, Key, Value projections
        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

        # QK Normalization for stability
        self.q_norm = OmniRMSNorm(self.head_dim)
        self.k_norm = OmniRMSNorm(self.head_dim)

        # Attention dropout
        self.attn_drop = nn.Dropout(attn_drop)

        # Output projection
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # Relative position bias (for temporal coherence)
        self.use_relative_position_bias = use_relative_position_bias
        if use_relative_position_bias:
            # Temporal relative position bias
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros((2 * window_size - 1), num_heads)
            )
            nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)

            # Get relative position index
            coords = torch.arange(window_size)
            relative_coords = coords[:, None] - coords[None, :]  # [window_size, window_size]
            relative_coords += window_size - 1  # Shift to start from 0
            self.register_buffer("relative_position_index", relative_coords)

    def get_relative_position_bias(self, window_size: int) -> torch.Tensor:
        """Generate relative position bias for attention"""
        if not self.use_relative_position_bias:
            return None

        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index[:window_size, :window_size].reshape(-1)
        ].reshape(window_size, window_size, -1)

        # Permute to [num_heads, window_size, window_size]
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        return relative_position_bias

    def forward(
        self,
        query: torch.Tensor,   # [B, T_q, L_q, D] or [B, L_q, D]
        key: torch.Tensor,     # [B, T_k, L_k, D] or [B, L_k, D]
        value: torch.Tensor,   # [B, T_v, L_v, D] or [B, L_v, D]
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Perform windowed cross attention.

        Args:
            query: Query tensor
            key: Key tensor
            value: Value tensor
            attention_mask: Optional attention mask

        Returns:
            Output tensor with same shape as query
        """
        # Handle both temporal and non-temporal inputs
        is_temporal = query.ndim == 4

        if is_temporal:
            B, T_q, L_q, D = query.shape
            _, T_k, L_k, _ = key.shape

            # Flatten temporal and spatial dims for cross attention
            query_flat = query.reshape(B, T_q * L_q, D)
            key_flat = key.reshape(B, T_k * L_k, D)
            value_flat = value.reshape(B, T_k * L_k, D)
        else:
            B, L_q, D = query.shape
            _, L_k, _ = key.shape
            query_flat = query
            key_flat = key
            value_flat = value

        # Project to Q, K, V
        q = self.q_proj(query_flat)  # [B, N_q, D]
        k = self.k_proj(key_flat)    # [B, N_k, D]
        v = self.v_proj(value_flat)  # [B, N_v, D]

        # Reshape for multi-head attention
        q = q.reshape(B, -1, self.num_heads, self.head_dim).transpose(1, 2)  # [B, H, N_q, head_dim]
        k = k.reshape(B, -1, self.num_heads, self.head_dim).transpose(1, 2)  # [B, H, N_k, head_dim]
        v = v.reshape(B, -1, self.num_heads, self.head_dim).transpose(1, 2)  # [B, H, N_v, head_dim]

        # Apply QK normalization
        q = self.q_norm(q)
        k = self.k_norm(k)

        # Scaled dot-product attention
        attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, H, N_q, N_k]

        # Add relative position bias if temporal
        if is_temporal and self.use_relative_position_bias:
            # Apply per-window bias
            rel_bias = self.get_relative_position_bias(min(T_q, self.window_size))
            if rel_bias is not None:
                # Broadcast bias across spatial dimensions
                attn = attn + rel_bias.unsqueeze(0).unsqueeze(2)

        # Apply attention mask
        if attention_mask is not None:
            attn = attn.masked_fill(attention_mask.unsqueeze(1).unsqueeze(2) == 0, float('-inf'))

        # Softmax and dropout
        attn = F.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        # Apply attention to values
        out = (attn @ v).transpose(1, 2).reshape(B, -1, D)  # [B, N_q, D]

        # Output projection
        out = self.proj(out)
        out = self.proj_drop(out)

        # Reshape back to original shape
        if is_temporal:
            out = out.reshape(B, T_q, L_q, D)
        else:
            out = out.reshape(B, L_q, D)

        return out

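A shape-level sketch of calling WindowCrossAttention on both a static [B, L, D] input and a windowed [B, T, L, D] input (editor's illustration with arbitrary sizes; the relative position bias is disabled in the temporal call to keep the sketch minimal; assumes OmniRMSNorm and the imports defined earlier in pipeline.py):

def _example_window_cross_attention() -> None:
    # Static path: plain [B, L, D] query attends over a longer context.
    attn = WindowCrossAttention(dim=64, num_heads=4, window_size=8)
    q = torch.randn(2, 16, 64)
    ctx = torch.randn(2, 32, 64)
    out = attn(q, ctx, ctx)                  # output keeps the query shape
    assert out.shape == q.shape

    # Temporal path on windowed [B, T, L, D] tensors, e.g. from TemporalWindowPartition.
    attn_t = WindowCrossAttention(dim=64, num_heads=4, window_size=8,
                                  use_relative_position_bias=False)
    qt = torch.randn(6, 8, 49, 64)
    out_t = attn_t(qt, qt, qt)
    assert out_t.shape == qt.shape
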
class MultiModalFusionLayer(nn.Module):
    """
    Fuses multiple modalities (audio, video, image) with learnable fusion weights.
    """
    def __init__(
        self,
        dim: int,
        num_modalities: int = 3,
        fusion_type: str = "weighted",  # "weighted", "gated", "adaptive"
    ):
        super().__init__()
        self.dim = dim
        self.num_modalities = num_modalities
        self.fusion_type = fusion_type

        if fusion_type == "weighted":
            # Learnable fusion weights
            self.fusion_weights = nn.Parameter(torch.ones(num_modalities) / num_modalities)

        elif fusion_type == "gated":
            # Gated fusion with cross-modal interactions
            self.gate_proj = nn.Sequential(
                nn.Linear(dim * num_modalities, dim * 2),
                nn.GELU(),
                nn.Linear(dim * 2, num_modalities),
                nn.Softmax(dim=-1)
            )

        elif fusion_type == "adaptive":
            # Adaptive fusion with per-token gating
            self.adaptive_gate = nn.Sequential(
                nn.Linear(dim, dim // 2),
                nn.GELU(),
                nn.Linear(dim // 2, num_modalities),
                nn.Sigmoid()
            )

    def forward(self, modality_features: List[torch.Tensor]) -> torch.Tensor:
        """
        Fuse multiple modality features.

        Args:
            modality_features: List of [B, L, D] tensors for each modality

        Returns:
            fused: Fused features [B, L, D]
        """
        if self.fusion_type == "weighted":
            # Simple weighted sum
            weights = F.softmax(self.fusion_weights, dim=0)
            fused = sum(w * feat for w, feat in zip(weights, modality_features))

        elif self.fusion_type == "gated":
            # Concatenate and compute gates
            concat_features = torch.cat(modality_features, dim=-1)  # [B, L, D * num_modalities]
            gates = self.gate_proj(concat_features)  # [B, L, num_modalities]

            # Apply gates
            stacked = torch.stack(modality_features, dim=-1)  # [B, L, D, num_modalities]
            fused = (stacked * gates.unsqueeze(2)).sum(dim=-1)  # [B, L, D]

        elif self.fusion_type == "adaptive":
            # Adaptive per-token fusion
            fused_list = []
            for feat in modality_features:
                gate = self.adaptive_gate(feat)  # [B, L, num_modalities]
                fused_list.append(feat.unsqueeze(-1) * gate.unsqueeze(2))

            fused = torch.cat(fused_list, dim=-1).sum(dim=-1)  # [B, L, D]

        return fused

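A brief sketch of the weighted fusion path (editor's illustration with arbitrary sizes; the "gated" and "adaptive" options share the same call signature):

def _example_multimodal_fusion() -> None:
    fusion = MultiModalFusionLayer(dim=64, num_modalities=3, fusion_type="weighted")
    feats = [torch.randn(2, 100, 64) for _ in range(3)]  # three aligned [B, L, D] streams
    fused = fusion(feats)
    assert fused.shape == feats[0].shape                 # [2, 100, 64]
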
class FancyMultiModalWindowAttentionBlock(nn.Module):
    """
    🎯 Fancy Multi-Modal Window Attention Block

    A state-of-the-art block that processes audio, video, and image embeddings
    with temporal window-based cross-attention for efficient multi-modal fusion.

    Features:
    - ✨ Temporal windowing for audio and video (frame-by-frame processing)
    - 🪟 Shifted window attention for better temporal coherence (Swin-style)
    - 🔄 Cross-modal attention between all modality pairs
    - 🎭 Adaptive multi-modal fusion with learnable gates
    - 🚀 Efficient computation with window partitioning
    - 💎 QK normalization for training stability

    Architecture:
        1. Temporal Partitioning (audio/video frames → windows)
        2. Intra-Modal Self-Attention (within each modality)
        3. Inter-Modal Cross-Attention (audio ↔ video ↔ image)
        4. Multi-Modal Fusion (adaptive weighted combination)
        5. Feed-Forward Network (SwiGLU activation)
        6. Window Merging (reconstruct temporal sequences)
    """

    def __init__(
        self,
        dim: int = 1024,
        num_heads: int = 16,
        window_size: int = 8,
        shift_size: int = 4,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: float = 0.1,
        use_relative_position_bias: bool = True,
        fusion_type: str = "adaptive",  # "weighted", "gated", "adaptive"
        use_shifted_window: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size if use_shifted_window else 0
        self.mlp_ratio = mlp_ratio

        # =============== Temporal Window Partitioning ===============
        self.window_partition = TemporalWindowPartition(
            window_size=window_size,
            shift_size=self.shift_size,
        )

        # =============== Intra-Modal Self-Attention ===============
        self.norm_audio_self = OmniRMSNorm(dim)
        self.norm_video_self = OmniRMSNorm(dim)
        self.norm_image_self = OmniRMSNorm(dim)

        self.audio_self_attn = WindowCrossAttention(
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_relative_position_bias=use_relative_position_bias,
        )

        self.video_self_attn = WindowCrossAttention(
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_relative_position_bias=use_relative_position_bias,
        )

        self.image_self_attn = WindowCrossAttention(
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_relative_position_bias=False,  # No temporal bias for static images
        )

        # =============== Inter-Modal Cross-Attention ===============
        # Audio → Video/Image
        self.norm_audio_cross = OmniRMSNorm(dim)
        self.audio_to_visual = WindowCrossAttention(
            dim=dim, num_heads=num_heads, window_size=window_size,
            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
        )

        # Video → Audio/Image
        self.norm_video_cross = OmniRMSNorm(dim)
        self.video_to_others = WindowCrossAttention(
            dim=dim, num_heads=num_heads, window_size=window_size,
            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
        )

        # Image → Audio/Video
        self.norm_image_cross = OmniRMSNorm(dim)
        self.image_to_temporal = WindowCrossAttention(
            dim=dim, num_heads=num_heads, window_size=window_size,
            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
        )

        # =============== Multi-Modal Fusion ===============
        self.multimodal_fusion = MultiModalFusionLayer(
            dim=dim,
            num_modalities=3,
            fusion_type=fusion_type,
        )

        # =============== Feed-Forward Network ===============
        self.norm_ffn = OmniRMSNorm(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.ffn = nn.Sequential(
            nn.Linear(dim, mlp_hidden_dim, bias=False),
            nn.GELU(),
            nn.Dropout(drop),
            nn.Linear(mlp_hidden_dim, dim, bias=False),
            nn.Dropout(drop),
        )

        # =============== Stochastic Depth (Drop Path) ===============
        self.drop_path = nn.Identity() if drop_path <= 0. else nn.Dropout(drop_path)

        # =============== Output Projections ===============
        self.output_projection = nn.ModuleDict({
            'audio': nn.Linear(dim, dim),
            'video': nn.Linear(dim, dim),
            'image': nn.Linear(dim, dim),
        })

    def forward(
        self,
        audio_embeds: Optional[torch.Tensor] = None,  # [B, T_audio, L_audio, D]
        video_embeds: Optional[torch.Tensor] = None,  # [B, T_video, L_video, D]
        image_embeds: Optional[torch.Tensor] = None,  # [B, L_image, D]
        attention_mask: Optional[torch.Tensor] = None,
        return_intermediates: bool = False,
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass of the Fancy Multi-Modal Window Attention Block.

        Args:
            audio_embeds: Audio embeddings [B, T_audio, L_audio, D]
                T_audio: number of audio frames
                L_audio: sequence length per frame
            video_embeds: Video embeddings [B, T_video, L_video, D]
                T_video: number of video frames
                L_video: sequence length per frame (e.g., patches)
            image_embeds: Image embeddings [B, L_image, D]
                L_image: sequence length (e.g., image patches)
            attention_mask: Optional attention mask
            return_intermediates: Whether to return intermediate features

        Returns:
            outputs: Dictionary containing processed embeddings for each modality
                - 'audio': [B, T_audio, L_audio, D]
                - 'video': [B, T_video, L_video, D]
                - 'image': [B, L_image, D]
                - 'fused': [B, L_total, D] (optional)
        """
        intermediates = {} if return_intermediates else None

        # ========== Stage 1: Temporal Window Partitioning ==========
        partitioned_audio, audio_info = None, None
        partitioned_video, video_info = None, None

        if audio_embeds is not None:
            partitioned_audio, audio_info = self.window_partition.partition(audio_embeds)
            if return_intermediates:
                intermediates['audio_windows'] = partitioned_audio

        if video_embeds is not None:
            partitioned_video, video_info = self.window_partition.partition(video_embeds)
            if return_intermediates:
                intermediates['video_windows'] = partitioned_video

        # ========== Stage 2: Intra-Modal Self-Attention ==========
        audio_self_out, video_self_out, image_self_out = None, None, None

        if audio_embeds is not None:
            audio_normed = self.norm_audio_self(partitioned_audio)
            audio_self_out = self.audio_self_attn(audio_normed, audio_normed, audio_normed)
            audio_self_out = partitioned_audio + self.drop_path(audio_self_out)

        if video_embeds is not None:
            video_normed = self.norm_video_self(partitioned_video)
            video_self_out = self.video_self_attn(video_normed, video_normed, video_normed)
            video_self_out = partitioned_video + self.drop_path(video_self_out)

        if image_embeds is not None:
            image_normed = self.norm_image_self(image_embeds)
            image_self_out = self.image_self_attn(image_normed, image_normed, image_normed)
            image_self_out = image_embeds + self.drop_path(image_self_out)

        # ========== Stage 3: Inter-Modal Cross-Attention ==========
        audio_cross_out, video_cross_out, image_cross_out = None, None, None

        # Prepare context (merge windows temporarily for cross-attention)
        if audio_self_out is not None:
            audio_merged = self.window_partition.merge(audio_self_out, audio_info)
        if video_self_out is not None:
            video_merged = self.window_partition.merge(video_self_out, video_info)

        # Audio attends to Video and Image
        if audio_embeds is not None:
            audio_q = self.norm_audio_cross(audio_merged)

            # Create key-value context from other modalities
            kv_list = []
            if video_embeds is not None:
                kv_list.append(video_merged)
            if image_embeds is not None:
                # Expand image to match temporal dimension
                B, L_img, D = image_self_out.shape
                T_audio = audio_merged.shape[1]
                image_expanded = image_self_out.unsqueeze(1).expand(B, T_audio, L_img, D)
                kv_list.append(image_expanded)

            if kv_list:
                # Concatenate along sequence dimension
                kv_context = torch.cat([kv.flatten(1, 2) for kv in kv_list], dim=1)
                kv_context = kv_context.reshape(B, -1, D)

                audio_cross_out = self.audio_to_visual(
                    audio_q.flatten(1, 2),
                    kv_context,
                    kv_context,
                    attention_mask
                )
                audio_cross_out = audio_cross_out.reshape_as(audio_merged)
                audio_cross_out = audio_merged + self.drop_path(audio_cross_out)
            else:
                audio_cross_out = audio_merged

        # Video attends to Audio and Image
        if video_embeds is not None:
            video_q = self.norm_video_cross(video_merged)

            kv_list = []
            if audio_embeds is not None:
                kv_list.append(audio_merged if audio_cross_out is None else audio_cross_out)
            if image_embeds is not None:
                B, L_img, D = image_self_out.shape
                T_video = video_merged.shape[1]
                image_expanded = image_self_out.unsqueeze(1).expand(B, T_video, L_img, D)
                kv_list.append(image_expanded)

            if kv_list:
                kv_context = torch.cat([kv.flatten(1, 2) for kv in kv_list], dim=1)
                kv_context = kv_context.reshape(B, -1, D)

                video_cross_out = self.video_to_others(
                    video_q.flatten(1, 2),
                    kv_context,
                    kv_context,
                    attention_mask
                )
                video_cross_out = video_cross_out.reshape_as(video_merged)
                video_cross_out = video_merged + self.drop_path(video_cross_out)
            else:
                video_cross_out = video_merged

        # Image attends to Audio and Video
        if image_embeds is not None:
            image_q = self.norm_image_cross(image_self_out)

            kv_list = []
            if audio_embeds is not None:
                # Average pool audio over time for image
                audio_pooled = (audio_merged if audio_cross_out is None else audio_cross_out).mean(dim=1)
                kv_list.append(audio_pooled)
            if video_embeds is not None:
                # Average pool video over time for image
                video_pooled = (video_merged if video_cross_out is None else video_cross_out).mean(dim=1)
                kv_list.append(video_pooled)

            if kv_list:
                kv_context = torch.cat(kv_list, dim=1)

                image_cross_out = self.image_to_temporal(
                    image_q,
                    kv_context,
                    kv_context,
                    attention_mask
                )
                image_cross_out = image_self_out + self.drop_path(image_cross_out)
            else:
                image_cross_out = image_self_out

        # ========== Stage 4: Multi-Modal Fusion ==========
        # Collect features from all modalities for fusion
        fusion_features = []
        if audio_cross_out is not None:
            audio_flat = audio_cross_out.flatten(1, 2)  # [B, T*L, D]
            fusion_features.append(audio_flat)
        if video_cross_out is not None:
            video_flat = video_cross_out.flatten(1, 2)  # [B, T*L, D]
            fusion_features.append(video_flat)
        if image_cross_out is not None:
            fusion_features.append(image_cross_out)  # [B, L, D]

        # Pad/align sequence lengths for fusion
        if len(fusion_features) > 1:
            max_len = max(f.shape[1] for f in fusion_features)
            aligned_features = []
            for feat in fusion_features:
                if feat.shape[1] < max_len:
                    pad_len = max_len - feat.shape[1]
                    feat = F.pad(feat, (0, 0, 0, pad_len))
                aligned_features.append(feat)

            # Fuse modalities
            fused_features = self.multimodal_fusion(aligned_features)
        else:
            fused_features = fusion_features[0] if fusion_features else None

        # ========== Stage 5: Feed-Forward Network ==========
        if fused_features is not None:
            fused_normed = self.norm_ffn(fused_features)
            fused_ffn = self.ffn(fused_normed)
            fused_features = fused_features + self.drop_path(fused_ffn)

        # ========== Stage 6: Prepare Outputs ==========
        outputs = {}

        # Project back to original shapes
        if audio_embeds is not None and audio_cross_out is not None:
            # Partition again for consistency
            audio_final, _ = self.window_partition.partition(audio_cross_out)
            audio_final = self.output_projection['audio'](audio_final)
            audio_final = self.window_partition.merge(audio_final, audio_info)
            outputs['audio'] = audio_final

        if video_embeds is not None and video_cross_out is not None:
            video_final, _ = self.window_partition.partition(video_cross_out)
            video_final = self.output_projection['video'](video_final)
            video_final = self.window_partition.merge(video_final, video_info)
            outputs['video'] = video_final

        if image_embeds is not None and image_cross_out is not None:
            image_final = self.output_projection['image'](image_cross_out)
            outputs['image'] = image_final

        if fused_features is not None:
            outputs['fused'] = fused_features

        if return_intermediates:
            outputs['intermediates'] = intermediates

        return outputs

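A hedged end-to-end sketch of the block's forward pass with all three modalities present (editor's illustration with small arbitrary sizes; relative position bias is turned off and the block is run in eval mode to keep the sketch minimal; assumes OmniRMSNorm and the module's existing imports):

def _example_fancy_block_forward() -> None:
    block = FancyMultiModalWindowAttentionBlock(
        dim=64, num_heads=4, window_size=4, shift_size=2,
        use_relative_position_bias=False,
    ).eval()
    outputs = block(
        audio_embeds=torch.randn(2, 6, 10, 64),   # [B, T_audio, L_audio, D]
        video_embeds=torch.randn(2, 8, 12, 64),   # [B, T_video, L_video, D]
        image_embeds=torch.randn(2, 16, 64),      # [B, L_image, D]
    )
    assert outputs['audio'].shape == (2, 6, 10, 64)
    assert outputs['video'].shape == (2, 8, 12, 64)
    assert outputs['image'].shape == (2, 16, 64)
    assert 'fused' in outputs
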
# -----------------------------------------------------------------------------
# 7. Optimization Utilities (FP8, Compilation, Mixed Precision)
# -----------------------------------------------------------------------------

@dataclass
class FP8Config:
    """Configuration for FP8 quantization"""
    enabled: bool = False
    margin: int = 0
    fp8_format: str = "hybrid"  # "e4m3", "e5m2", "hybrid"
    amax_history_len: int = 1024
    amax_compute_algo: str = "max"


@dataclass
class CompilationConfig:
    """Configuration for torch.compile"""
    enabled: bool = False
    mode: str = "reduce-overhead"  # "default", "reduce-overhead", "max-autotune"
    fullgraph: bool = False
    dynamic: bool = True
    backend: str = "inductor"


@dataclass
class MixedPrecisionConfig:
    """Configuration for mixed precision training/inference"""
    enabled: bool = True
    dtype: str = "bfloat16"  # "float16", "bfloat16"
    use_amp: bool = True

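A small sketch of conservative values for the three config dataclasses above (an assumption-level example, not settings taken from the commit):

def _example_optimizer_configs() -> Tuple[FP8Config, CompilationConfig, MixedPrecisionConfig]:
    # FP8 and torch.compile off, bfloat16 autocast on.
    return (
        FP8Config(enabled=False),
        CompilationConfig(enabled=False, mode="reduce-overhead"),
        MixedPrecisionConfig(enabled=True, dtype="bfloat16", use_amp=True),
    )
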
class ModelOptimizer:
    """
    Unified model optimizer supporting FP8 quantization, torch.compile,
    and mixed precision inference.
    """
    def __init__(
        self,
        fp8_config: Optional[FP8Config] = None,
        compilation_config: Optional[CompilationConfig] = None,
        mixed_precision_config: Optional[MixedPrecisionConfig] = None,
    ):
        self.fp8_config = fp8_config or FP8Config()
        self.compilation_config = compilation_config or CompilationConfig()
        self.mixed_precision_config = mixed_precision_config or MixedPrecisionConfig()

        # Setup mixed precision
        self._setup_mixed_precision()

    def _setup_mixed_precision(self):
        """Setup mixed precision context"""
        if self.mixed_precision_config.enabled:
            dtype_map = {
                "float16": torch.float16,
                "bfloat16": torch.bfloat16,
            }
            self.dtype = dtype_map.get(self.mixed_precision_config.dtype, torch.bfloat16)
        else:
            self.dtype = torch.float32

    @contextmanager
    def autocast_context(self):
        """Context manager for automatic mixed precision"""
        if self.mixed_precision_config.enabled and self.mixed_precision_config.use_amp:
            with torch.autocast(device_type='cuda', dtype=self.dtype):
                yield
        else:
            yield

    def _compile_model(self, model: nn.Module) -> nn.Module:
        """Compile model using torch.compile"""
        if not self.compilation_config.enabled or not HAS_TORCH_COMPILE:
            return model

        return torch.compile(
            model,
            mode=self.compilation_config.mode,
            fullgraph=self.compilation_config.fullgraph,
            dynamic=self.compilation_config.dynamic,
            backend=self.compilation_config.backend,
        )

    def _quantize_model_fp8(self, model: nn.Module) -> nn.Module:
        """Apply FP8 quantization using Transformer Engine"""
        if not self.fp8_config.enabled or not HAS_TRANSFORMER_ENGINE:
            return model

        # Convert compatible layers to FP8
        for name, module in model.named_modules():
            if isinstance(module, nn.Linear):
                # Replace with TE FP8 Linear
                fp8_linear = te.Linear(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                )
                # Copy weights
                fp8_linear.weight.data.copy_(module.weight.data)
                if module.bias is not None:
                    fp8_linear.bias.data.copy_(module.bias.data)

                # Replace module
                parent_name = '.'.join(name.split('.')[:-1])
                child_name = name.split('.')[-1]
                if parent_name:
                    parent = dict(model.named_modules())[parent_name]
                    setattr(parent, child_name, fp8_linear)

        return model

    def optimize_model(
        self,
        model: nn.Module,
        apply_compilation: bool = True,
        apply_quantization: bool = True,
        apply_mixed_precision: bool = True,
    ) -> nn.Module:
        """
        Apply all optimizations to model.

        Args:
            model: Model to optimize
            apply_compilation: Whether to compile with torch.compile
            apply_quantization: Whether to apply FP8 quantization
            apply_mixed_precision: Whether to convert to mixed precision dtype

        Returns:
            Optimized model
        """
        # Apply FP8 quantization first
        if apply_quantization and self.fp8_config.enabled:
            model = self._quantize_model_fp8(model)

        # Convert to mixed precision dtype
        if apply_mixed_precision and self.mixed_precision_config.enabled:
            model = model.to(dtype=self.dtype)

        # Compile model last
        if apply_compilation and self.compilation_config.enabled:
            model = self._compile_model(model)

        return model

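A usage sketch for ModelOptimizer (editor's illustration; it relies on the HAS_TORCH_COMPILE and HAS_TRANSFORMER_ENGINE flags defined earlier in pipeline.py and keeps FP8 and compilation disabled, so only the bfloat16 cast and the autocast context are exercised):

def _example_optimize_model(model: nn.Module) -> nn.Module:
    optimizer = ModelOptimizer(
        fp8_config=FP8Config(enabled=False),
        compilation_config=CompilationConfig(enabled=False),
        mixed_precision_config=MixedPrecisionConfig(enabled=True, dtype="bfloat16"),
    )
    model = optimizer.optimize_model(model)   # cast to bfloat16; FP8/compile skipped
    with optimizer.autocast_context():
        pass  # run inference here
    return model
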
@contextmanager
def optimized_inference_mode(
    enable_cudnn_benchmark: bool = True,
    enable_tf32: bool = True,
    enable_flash_sdp: bool = True,
):
    """
    Context manager for optimized inference with various PyTorch optimizations.

    Args:
        enable_cudnn_benchmark: Enable cuDNN autotuner
        enable_tf32: Enable TF32 for faster matmul on Ampere+ GPUs
        enable_flash_sdp: Enable Flash Attention in scaled_dot_product_attention
    """
    # Save original states
    orig_benchmark = torch.backends.cudnn.benchmark
    orig_tf32_matmul = torch.backends.cuda.matmul.allow_tf32
    orig_tf32_cudnn = torch.backends.cudnn.allow_tf32
    orig_sdp_flash = torch.backends.cuda.flash_sdp_enabled()

    try:
        # Enable optimizations
        torch.backends.cudnn.benchmark = enable_cudnn_benchmark
        torch.backends.cuda.matmul.allow_tf32 = enable_tf32
        torch.backends.cudnn.allow_tf32 = enable_tf32

        if enable_flash_sdp:
            torch.backends.cuda.enable_flash_sdp(True)

        yield

    finally:
        # Restore original states
        torch.backends.cudnn.benchmark = orig_benchmark
        torch.backends.cuda.matmul.allow_tf32 = orig_tf32_matmul
        torch.backends.cudnn.allow_tf32 = orig_tf32_cudnn
        torch.backends.cuda.enable_flash_sdp(orig_sdp_flash)
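Finally, a usage sketch pairing optimized_inference_mode with torch.inference_mode for a generic module (editor's illustration; model and batch are placeholders):

def _example_optimized_inference(model: nn.Module, batch: torch.Tensor) -> torch.Tensor:
    # TF32, cuDNN autotuning and flash SDP are enabled only inside the block
    # and restored afterwards by the context manager above.
    with optimized_inference_mode(enable_tf32=True, enable_flash_sdp=True):
        with torch.inference_mode():
            return model(batch)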