Upload folder using huggingface_hub

modeling_midashenglm.py (CHANGED, +2 -42)
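The commit title is the default message that `huggingface_hub` writes when a folder is pushed as a single commit. A hypothetical sketch of the call that produces such a commit (the folder path and repo id below are placeholders, not taken from this repo):

# Minimal sketch: push a local folder to the Hub as one commit.
# "local_model_dir" and "your-org/your-model" are placeholders.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="local_model_dir",   # contains modeling_midashenglm.py
    repo_id="your-org/your-model",
    commit_message="Upload folder using huggingface_hub",  # the library default
)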
@@ -19,44 +19,6 @@ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
 from .configuration_midashenglm import DashengConfig, MiAudioLLMHFConfig
 
 
-# The functions `drop_path` and the module `DropPath` are taken from timm
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-    'survival rate' as the argument.
-    """
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0 and scale_by_keep:
-        random_tensor.div_(keep_prob)
-    return x * random_tensor
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
-
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
-
-    def extra_repr(self):
-        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
-
-
 def to_2tuple(x: Any) -> Tuple[Any, Any]:
     if isinstance(x, collections.abc.Iterable):
         return x
@@ -228,7 +190,6 @@ class Block(nn.Module):
         self.ls1 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         self.norm2 = norm_layer(dim)
         self.mlp = Mlp(
@@ -240,12 +201,11 @@ class Block(nn.Module):
         self.ls2 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         # Kwargs usually has a mask parameter that is passed to Attention
     def forward(self, x, **kwargs):
-        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), **kwargs)))
-        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        x = x + self.ls1(self.attn(self.norm1(x), **kwargs))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
         return x


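For context on why the deletion is safe for this checkpoint: `drop_path` returns its input unchanged whenever `drop_prob == 0.0` or `training` is false, and `Block` only instantiated `DropPath` when its `drop_path` argument was positive. A minimal standalone sketch (assuming PyTorch; the helper below is the deleted one, lightly condensed) that checks the identity behavior:

# Standalone copy of the deleted helper, to verify when it is a no-op.
import torch

def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
    if drop_prob == 0.0 or not training:
        return x  # identity: the only path taken at inference
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # broadcast over all non-batch dims
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)  # rescale so the expected output equals the input
    return x * random_tensor

x = torch.randn(2, 8, 16)
assert drop_path(x, drop_prob=0.0, training=True) is x   # disabled: exact identity
assert drop_path(x, drop_prob=0.3, training=False) is x  # eval mode: exact identity

In eval mode the old wrapped forward and the new unwrapped forward therefore compute identical outputs; only a training run configured with drop_path > 0.0 would behave differently.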