zhoukz committed
Commit 89cff4d · Parent: c9ab6b8

Upload folder using huggingface_hub

Files changed (1)
  1. modeling_midashenglm.py +2 -42
modeling_midashenglm.py CHANGED
@@ -19,44 +19,6 @@ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
 from .configuration_midashenglm import DashengConfig, MiAudioLLMHFConfig
 
 
-# The functions `drop_path` and the module `DropPath` are taken from timm
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-    'survival rate' as the argument.
-    """
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0 and scale_by_keep:
-        random_tensor.div_(keep_prob)
-    return x * random_tensor
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
-
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
-
-    def extra_repr(self):
-        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
-
-
 def to_2tuple(x: Any) -> Tuple[Any, Any]:
     if isinstance(x, collections.abc.Iterable):
         return x
@@ -228,7 +190,6 @@ class Block(nn.Module):
         self.ls1 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         self.norm2 = norm_layer(dim)
         self.mlp = Mlp(
@@ -240,12 +201,11 @@ class Block(nn.Module):
         self.ls2 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
     # Kwargs usually has a mask parameter that is passed to Attention
     def forward(self, x, **kwargs):
-        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), **kwargs)))
-        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        x = x + self.ls1(self.attn(self.norm1(x), **kwargs))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
         return x
 
 
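For reference, the removed timm `drop_path` helper (stochastic depth) returns its input unchanged whenever `drop_prob == 0.0` or the module is not in training mode, so dropping the `DropPath` wrappers leaves the block's inference-time output identical. Below is a minimal, self-contained sketch of that behavior, reproduced from the removed helper; it is illustrative only and not part of the repository's module.

import torch

# Sketch of the removed timm-style drop_path (stochastic depth) helper.
def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
    if drop_prob == 0.0 or not training:
        # Identity in eval mode or when the drop probability is zero.
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli keep/drop decision per sample, broadcast over all remaining dims.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor

x = torch.randn(4, 16, 32)
assert torch.equal(drop_path(x, drop_prob=0.1, training=False), x)  # eval mode: identity
assert torch.equal(drop_path(x, drop_prob=0.0, training=True), x)   # zero drop prob: identity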