update for v5

Files changed:
- ultravox_model.py       +38 -17
- ultravox_pipeline.py     +1  -1
- ultravox_processing.py   +3  -3
ultravox_model.py

@@ -1,5 +1,6 @@
 import logging
 import re
+import inspect
 from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
 
 import accelerate
@@ -784,6 +785,8 @@ class ModifiedWhisperEncoder(
     def __init__(self, config: transformers.WhisperConfig):
         super().__init__(config)
         self.config.is_decoder = False
+        sig = inspect.signature(self.layers[0].forward)
+        self.use_layer_head_mask = "layer_head_mask" in sig.parameters
 
     @property
     def max_context_length(self):
@@ -828,10 +831,10 @@ class ModifiedWhisperEncoder(
         self,
         input_features,
         audio_len=None,
-        head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
@@ -898,7 +901,8 @@ class ModifiedWhisperEncoder(
         attention_mask = attention_mask.to(hidden_states.dtype)
 
         # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
+        if self.use_layer_head_mask and kwargs.get("head_mask") is not None:
+            head_mask = kwargs.get("head_mask")
             assert head_mask.size()[0] == (
                 len(self.layers)
             ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
@@ -917,22 +921,39 @@ class ModifiedWhisperEncoder(
                 layer_outputs = (None, None)
             else:
                 if self.gradient_checkpointing and self.training:
-                    layer_outputs = self._gradient_checkpointing_func(
-                        encoder_layer.__call__,
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                        output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            (head_mask[idx] if head_mask is not None else None),
+                            output_attentions,
+                        )
+                    else:
+                        layer_outputs = self._gradient_checkpointing_func(
+                            encoder_layer.__call__,
+                            hidden_states,
+                            attention_mask,
+                            output_attentions,
+                        )
                 else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(
-                            head_mask[idx] if head_mask is not None else None
-                        ),
-                        output_attentions=output_attentions,
-                    )
+                    if self.use_layer_head_mask:
+                        head_mask = kwargs.get("head_mask")
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            layer_head_mask=(
+                                head_mask[idx] if head_mask is not None else None
+                            ),
+                            output_attentions=output_attentions,
+                        )
+                    else:
+                        layer_outputs = encoder_layer(
+                            hidden_states,
+                            attention_mask,
+                            output_attentions=output_attentions,
+                        )
 
                 hidden_states = layer_outputs[0]
 
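Note on the ModifiedWhisperEncoder change: the encoder now probes the first layer's forward() signature once in __init__ and caches whether it still accepts layer_head_mask, so the same checkpoint can run on transformers versions that kept or dropped that argument. Below is a minimal, self-contained sketch of that detection pattern; OldStyleLayer and NewStyleLayer are hypothetical stand-ins, not classes from this repo.

import inspect

class OldStyleLayer:
    # Mimics an encoder layer whose forward() still takes a per-layer head mask.
    def forward(self, hidden_states, attention_mask, layer_head_mask=None, output_attentions=False):
        return (hidden_states,)

class NewStyleLayer:
    # Mimics an encoder layer whose forward() dropped the layer_head_mask argument.
    def forward(self, hidden_states, attention_mask, output_attentions=False):
        return (hidden_states,)

def accepts_layer_head_mask(layer) -> bool:
    # Same idea as the new __init__ code: inspect the signature once and
    # branch on the cached result instead of pinning a transformers version.
    return "layer_head_mask" in inspect.signature(layer.forward).parameters

print(accepts_layer_head_mask(OldStyleLayer()))  # True
print(accepts_layer_head_mask(NewStyleLayer()))  # False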
ultravox_pipeline.py

@@ -29,7 +29,7 @@ class UltravoxPipeline(transformers.Pipeline):
             )
 
         if audio_processor is None:
-            audio_processor = transformers.AutoProcessor.from_pretrained(
+            audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
                 model.config.audio_model_id or model.config.audio_config._name_or_path
             )
 
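The pipeline now loads only the audio feature extractor rather than a full processor wrapper. A hedged usage sketch of that loading path, with "openai/whisper-tiny" standing in for model.config.audio_model_id:

import transformers

# Resolve the audio model id directly to a feature extractor (no tokenizer or
# processor wrapper), matching the new loading path in UltravoxPipeline.
audio_processor = transformers.AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
print(type(audio_processor).__name__)  # WhisperFeatureExtractor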
ultravox_processing.py

@@ -74,7 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
     """
 
     attributes = ["audio_processor", "tokenizer"]
-    audio_processor_class = ("WhisperProcessor",)
+    audio_processor_class = ("WhisperFeatureExtractor",)
     tokenizer_class = (
         "PreTrainedTokenizer",
         "PreTrainedTokenizerFast",
@@ -124,7 +124,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         config: UltravoxConfig = transformers.AutoConfig.from_pretrained(
             pretrained_model_name_or_path, **kwargs
         )
-        audio_processor = transformers.AutoProcessor.from_pretrained(
+        audio_processor = transformers.AutoFeatureExtractor.from_pretrained(
            config.audio_model_id
            or config.audio_config._name_or_path
            or "openai/whisper-tiny"
@@ -273,7 +273,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
        audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
 
        # Pad out each audio to at least 2 hops (the minimum required by the processor).
-       hop_length = self.audio_processor.feature_extractor.hop_length
+       hop_length = self.audio_processor.hop_length
        audios = [
            (
                np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")