eustlb (HF Staff) committed
Commit 019a228 · 1 Parent(s): 20ae7cc

update for v5

ultravox_model.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 import re
+import inspect
 from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
 
 import accelerate
@@ -784,6 +785,8 @@ class ModifiedWhisperEncoder(
     def __init__(self, config: transformers.WhisperConfig):
         super().__init__(config)
         self.config.is_decoder = False
+        sig = inspect.signature(self.layers[0].forward)
+        self.use_layer_head_mask = "layer_head_mask" in sig.parameters
 
     @property
     def max_context_length(self):
@@ -828,10 +831,10 @@ class ModifiedWhisperEncoder(
         self,
         input_features,
         audio_len=None,
-        head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
@@ -898,7 +901,8 @@ class ModifiedWhisperEncoder(
         attention_mask = attention_mask.to(hidden_states.dtype)
 
         # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
+        if self.use_layer_head_mask and kwargs.get("head_mask") is not None:
+            head_mask = kwargs.get("head_mask")
             assert head_mask.size()[0] == (
                 len(self.layers)
             ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
@@ -917,22 +921,39 @@ class ModifiedWhisperEncoder(
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
-                    layer_outputs = self._gradient_checkpointing_func(
-                        encoder_layer.__call__,
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                        output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            (head_mask[idx] if head_mask is not None else None),
+                            output_attentions,
+                        )
+                    else:
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            output_attentions,
+                        )
                 else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(
-                            head_mask[idx] if head_mask is not None else None
-                        ),
-                        output_attentions=output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            layer_head_mask=(
+                                head_mask[idx] if head_mask is not None else None
+                            ),
+                            output_attentions=output_attentions,
+                        )
+                    else:
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            output_attentions=output_attentions,
+                        )
 
                 hidden_states = layer_outputs[0]
 
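Note: the v5 update above boils down to a small feature-detection step. Rather than pinning a transformers version, the encoder inspects the first layer's forward signature once in __init__ and only threads a per-layer head mask through when the installed release still accepts one (the commit assumes the layer_head_mask argument may be absent in v5-style encoder layers). Below is a minimal, self-contained sketch of that pattern using hypothetical stand-in layer classes, not the real transformers ones.

import inspect


class OldEncoderLayer:
    # Pre-v5-style signature: forward() still accepts layer_head_mask.
    def forward(self, hidden_states, attention_mask, layer_head_mask=None, output_attentions=False):
        return (hidden_states, None)


class NewEncoderLayer:
    # v5-style signature: layer_head_mask is no longer a parameter.
    def forward(self, hidden_states, attention_mask, output_attentions=False):
        return (hidden_states, None)


def supports_layer_head_mask(layer) -> bool:
    # Same probe as the commit: check the bound forward method's parameters.
    return "layer_head_mask" in inspect.signature(layer.forward).parameters


print(supports_layer_head_mask(OldEncoderLayer()))  # True
print(supports_layer_head_mask(NewEncoderLayer()))  # False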
ultravox_pipeline.py CHANGED
@@ -29,7 +29,7 @@ class UltravoxPipeline(transformers.Pipeline):
         )
 
         if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
+            audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
                 model.config.audio_model_id or model.config.audio_config._name_or_path
             )
 
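For context on the swap above: for a Whisper-style audio backbone, AutoFeatureExtractor resolves directly to a WhisperFeatureExtractor, whereas AutoProcessor returns a WhisperProcessor that bundles the feature extractor with a tokenizer the pipeline does not need. A hedged sketch follows; "openai/whisper-tiny" is only an illustrative checkpoint and the call downloads its files on first use.

import transformers

# Loads only the feature extractor; no tokenizer is pulled in.
feature_extractor = transformers.AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
print(type(feature_extractor).__name__)  # expected: WhisperFeatureExtractor
print(feature_extractor.hop_length)      # Whisper's STFT hop length, typically 160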
ultravox_processing.py CHANGED
@@ -74,7 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
     """
 
     attributes = ["audio_processor", "tokenizer"]
-    audio_processor_class = ("WhisperProcessor",)
+    audio_processor_class = ("WhisperFeatureExtractor",)
     tokenizer_class = (
         "PreTrainedTokenizer",
         "PreTrainedTokenizerFast",
@@ -124,7 +124,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, **kwargs
         )
-        audio_processor = transformers.AutoProcessor.from_pretrained(
+        audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
             config.audio_model_id
             or config.audio_config._name_or_path
             or "openai/whisper-tiny"
@@ -273,7 +273,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
 
         # Pad out each audio to at least 2 hops (the minimum required by the processor).
-        hop_length = self.audio_processor.feature_extractor.hop_length
+        hop_length = self.audio_processor.hop_length
         audios = [
             (
                 np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")
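The last hunk only changes where hop_length is read from: since audio_processor now holds a WhisperFeatureExtractor directly, the extra .feature_extractor hop disappears. A small sketch of the two-hop padding step itself; the guard condition is assumed (the hunk truncates it) and hop_length is hard-coded to Whisper's usual 160 samples at 16 kHz.

import numpy as np

hop_length = 160  # stand-in for self.audio_processor.hop_length
audios = [np.zeros(100, dtype=np.float32), np.zeros(5000, dtype=np.float32)]

# Pad any clip shorter than 2 hops up to exactly 2 * hop_length samples.
audios = [
    (
        np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")
        if len(x) < 2 * hop_length
        else x
    )
    for x in audios
]
print([len(x) for x in audios])  # [320, 5000]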