update for v5

Files changed:
- ultravox_model.py       +38 -17
- ultravox_pipeline.py     +1  -1
- ultravox_processing.py   +3  -3
ultravox_model.py

@@ -1,5 +1,6 @@
 import logging
 import re
+import inspect
 from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
 
 import accelerate
@@ -784,6 +785,8 @@ class ModifiedWhisperEncoder(
     def __init__(self, config: transformers.WhisperConfig):
         super().__init__(config)
         self.config.is_decoder = False
+        sig = inspect.signature(self.layers[0].forward)
+        self.use_layer_head_mask = "layer_head_mask" in sig.parameters
 
     @property
     def max_context_length(self):
@@ -828,10 +831,10 @@ class ModifiedWhisperEncoder(
         self,
         input_features,
         audio_len=None,
-        head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
@@ -898,7 +901,8 @@ class ModifiedWhisperEncoder(
         attention_mask = attention_mask.to(hidden_states.dtype)
 
         # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
+        if self.use_layer_head_mask and kwargs.get("head_mask") is not None:
+            head_mask = kwargs.get("head_mask")
             assert head_mask.size()[0] == (
                 len(self.layers)
             ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
@@ -917,22 +921,39 @@ class ModifiedWhisperEncoder(
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
-                    layer_outputs = self._gradient_checkpointing_func(
-                        encoder_layer.__call__,
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                        output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            (head_mask[idx] if head_mask is not None else None),
+                            output_attentions,
+                        )
+                    else:
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            output_attentions,
+                        )
                 else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(
-                            head_mask[idx] if head_mask is not None else None
-                        ),
-                        output_attentions=output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            layer_head_mask=(
+                                head_mask[idx] if head_mask is not None else None
+                            ),
+                            output_attentions=output_attentions,
+                        )
+                    else:
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            output_attentions=output_attentions,
+                        )
 
                 hidden_states = layer_outputs[0]
 
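Note on the ModifiedWhisperEncoder change: the encoder now probes the first layer's forward() signature once in __init__ and caches whether it still accepts layer_head_mask, so the same checkpoint can run on transformers versions that kept or dropped that argument. Below is a minimal, self-contained sketch of that detection pattern; OldStyleLayer and NewStyleLayer are hypothetical stand-ins, not classes from this repo.

import inspect

class OldStyleLayer:
    # Mimics an encoder layer whose forward() still takes a per-layer head mask.
    def forward(self, hidden_states, attention_mask, layer_head_mask=None, output_attentions=False):
        return (hidden_states,)

class NewStyleLayer:
    # Mimics an encoder layer whose forward() dropped the layer_head_mask argument.
    def forward(self, hidden_states, attention_mask, output_attentions=False):
        return (hidden_states,)

def accepts_layer_head_mask(layer) -> bool:
    # Same idea as the new __init__ code: inspect the signature once and
    # branch on the cached result instead of pinning a transformers version.
    return "layer_head_mask" in inspect.signature(layer.forward).parameters

print(accepts_layer_head_mask(OldStyleLayer()))  # True
print(accepts_layer_head_mask(NewStyleLayer()))  # False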
ultravox_pipeline.py

@@ -29,7 +29,7 @@ class UltravoxPipeline(transformers.Pipeline):
             )
 
         if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
+            audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
                 model.config.audio_model_id or model.config.audio_config._name_or_path
             )
 
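The pipeline now loads only the audio feature extractor rather than a full processor wrapper. A hedged usage sketch of that loading path, with "openai/whisper-tiny" standing in for model.config.audio_model_id:

import transformers

# Resolve the audio model id directly to a feature extractor (no tokenizer or
# processor wrapper), matching the new loading path in UltravoxPipeline.
audio_processor = transformers.AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
print(type(audio_processor).__name__)  # WhisperFeatureExtractor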
ultravox_processing.py

@@ -74,7 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
     """
 
     attributes = ["audio_processor", "tokenizer"]
-    audio_processor_class = ("WhisperProcessor",)
+    audio_processor_class = ("WhisperFeatureExtractor",)
     tokenizer_class = (
         "PreTrainedTokenizer",
         "PreTrainedTokenizerFast",
@@ -124,7 +124,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, **kwargs
         )
-        audio_processor = transformers.AutoProcessor.from_pretrained(
+        audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
            config.audio_model_id
            or config.audio_config._name_or_path
            or "openai/whisper-tiny"
@@ -273,7 +273,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
        audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
 
        # Pad out each audio to at least 2 hops (the minimum required by the processor).
-       hop_length = self.audio_processor.feature_extractor.hop_length
+       hop_length = self.audio_processor.hop_length
        audios = [
            (
                np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")