zhoukz committed
Commit 89cff4d · Parent: c9ab6b8

Upload folder using huggingface_hub

Files changed (1)
  1. modeling_midashenglm.py +2 -42
modeling_midashenglm.py CHANGED
@@ -19,44 +19,6 @@ from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
 from .configuration_midashenglm import DashengConfig, MiAudioLLMHFConfig
 
 
-# The functions `drop_path` and the module `DropPath` are taken from timm
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-    'survival rate' as the argument.
-    """
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0 and scale_by_keep:
-        random_tensor.div_(keep_prob)
-    return x * random_tensor
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
-
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
-
-    def extra_repr(self):
-        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
-
-
 def to_2tuple(x: Any) -> Tuple[Any, Any]:
     if isinstance(x, collections.abc.Iterable):
         return x
@@ -228,7 +190,6 @@ class Block(nn.Module):
         self.ls1 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         self.norm2 = norm_layer(dim)
         self.mlp = Mlp(
@@ -240,12 +201,11 @@ class Block(nn.Module):
         self.ls2 = (
             LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         )
-        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
     # Kwargs usually has a mask parameter that is passed to Attention
     def forward(self, x, **kwargs):
-        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), **kwargs)))
-        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        x = x + self.ls1(self.attn(self.norm1(x), **kwargs))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
         return x
 
 
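For reference, the removed timm `drop_path` helper (stochastic depth) returns its input unchanged whenever `drop_prob == 0.0` or the module is not in training mode, so dropping the `DropPath` wrappers leaves the block's inference-time output identical. Below is a minimal, self-contained sketch of that behavior, reproduced from the removed helper; it is illustrative only and not part of the repository's module.

import torch

# Sketch of the removed timm-style drop_path (stochastic depth) helper.
def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
    if drop_prob == 0.0 or not training:
        # Identity in eval mode or when the drop probability is zero.
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli keep/drop decision per sample, broadcast over all remaining dims.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor

x = torch.randn(4, 16, 32)
assert torch.equal(drop_path(x, drop_prob=0.1, training=False), x)  # eval mode: identity
assert torch.equal(drop_path(x, drop_prob=0.0, training=True), x)   # zero drop prob: identity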