Upload folder using huggingface_hub

Changed files:
- README.md +14 -2
- chat_template.jinja +1 -1
- config.json +0 -1
- modeling_midashenglm.py +57 -77
README.md
@@ -1,17 +1,29 @@
 ---
-license:
+license: apache-2.0
+# TODO: which license?
 language:
 - en
 - zh
+# TODO: specify exactly which languages are supported
 pipeline_tag: audio-text-to-text
 tags:
 - multimodal
 - audio-language-model
 - audio
+# - audio-captioning
+# - audio-classification
+# - audio-generation
+# - audio-question-answering
+# - audio-understanding
+# - chat
+# - speech-recognition
+# - text-to-speech
+# TODO: decide which capabilities to list
 base_model:
 - mispeech/dasheng-0.6B
 - Qwen/Qwen2.5-Omni-3B
 base_model_relation: finetune
+# TODO: check that this is correct
 ---
 
 # MiDashengLM
@@ -118,4 +130,4 @@ base_model_relation: finetune
 
 ```bibtex
 TODO
-```
+```
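The front matter above is machine-readable model-card metadata, not just display text. A minimal sketch of how it can be inspected once the commit lands, using huggingface_hub's ModelCard API; the repo id below is a placeholder (it is not stated in this commit), and the `base_model` field assumes a recent huggingface_hub release:

```python
from huggingface_hub import ModelCard

# "mispeech/midashenglm" is a hypothetical repo id; substitute the real one.
card = ModelCard.load("mispeech/midashenglm")

print(card.data.license)       # "apache-2.0"
print(card.data.pipeline_tag)  # "audio-text-to-text"
print(card.data.tags)          # ["multimodal", "audio-language-model", "audio"]
print(card.data.base_model)    # ["mispeech/dasheng-0.6B", "Qwen/Qwen2.5-Omni-3B"]
```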
chat_template.jinja
@@ -1,6 +1,6 @@
 {%- for message in messages -%}
 {%- if loop.first and message["role"] != "system" -%}
-{{- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" -}}
+{{- "<|im_start|>system\nYou are a helpful language and speech assistant.<|im_end|>\n" -}}
 {%- endif -%}
 {{- "<|im_start|>" -}}
 {{- message["role"] -}}
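The only behavioral change is the default system prompt injected when a conversation does not begin with a system message. A minimal sketch rendering just the fragment shown above (an `{%- endfor -%}` is added here so the fragment parses on its own; the real template continues past these lines):

```python
from jinja2 import Template

# Fragment of the chat template, closed with an added endfor for this demo.
fragment = (
    '{%- for message in messages -%}'
    '{%- if loop.first and message["role"] != "system" -%}'
    '{{- "<|im_start|>system\\nYou are a helpful language and speech assistant.<|im_end|>\\n" -}}'
    '{%- endif -%}'
    '{{- "<|im_start|>" -}}'
    '{{- message["role"] -}}'
    '{%- endfor -%}'
)

# A conversation with no system message triggers the injected default prompt.
print(Template(fragment).render(messages=[{"role": "user"}]))
# <|im_start|>system
# You are a helpful language and speech assistant.<|im_end|>
# <|im_start|>user
```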
config.json
@@ -36,7 +36,6 @@
     "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
     "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
-  "lora_target_modules": "all-linear",
   "model_type": "midashenglm",
   "subsample_factor": 5,
   "text_config": {
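The `auto_map` entries above are what let transformers resolve the repo's custom config and model classes at load time; the dropped `lora_target_modules` key looks like a training-time leftover that no inference path reads. A minimal loading sketch with standard transformers calls; the repo id is again a placeholder:

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "mispeech/midashenglm"  # hypothetical repo id

# trust_remote_code=True is required because auto_map points at code
# shipped inside the repo (configuration_midashenglm / modeling_midashenglm).
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

print(config.model_type)        # "midashenglm"
print(config.subsample_factor)  # 5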
modeling_midashenglm.py
@@ -474,44 +474,22 @@ class MiDashengLMModel(PreTrainedModel):
 
         return encoder_out
 
-    def _prepare_with_input_ids(
+    def _prepare_inputs_embeds(
         self,
-        input_ids: torch.Tensor,
-        audio_embeddings: Optional[torch.Tensor],
-        audio_token_id: Optional[int],
-    ) -> torch.Tensor:
-        input_embeddings = self.decoder.model.embed_tokens(input_ids)
-        if audio_embeddings is not None:
-            special_mask = input_ids == audio_token_id
-            assert audio_embeddings.shape[1] <= (special_mask.sum(-1)).max(), (
-                "Mask and audio embeddings seem to have different sizes: "
-                f"{audio_embeddings.shape=}, {special_mask=}, {input_ids=}, "
-                f"{audio_embeddings.shape[1]=} vs {(special_mask.sum(-1)).max()=}"
-            )
-            audio_embeddings = audio_embeddings.to(input_embeddings.dtype)
-
-            for i in range(len(special_mask)):
-                mask = special_mask[i]
-                number_of_tokens = mask.sum(-1)
-                input_embeddings[i, mask] = audio_embeddings[i, :number_of_tokens]
-            return input_embeddings
-        else:
-            return input_embeddings
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        input_values: Optional[Tensor] = None,
-        inputs_embeds: Optional[Tensor] = None,
+        input_ids: Optional[torch.Tensor],
+        input_values: Optional[torch.Tensor],
+        inputs_embeds: Optional[torch.Tensor],
         audio_length: Optional[Iterable[int]] = None,
         audio_token_id: Optional[int] = None,
-        **kwargs,
-    ):
+    ) -> torch.Tensor:
         if input_ids is not None:
             if inputs_embeds is not None:
                 raise ValueError(
                     "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
                 )
+            inputs_embeds = cast(
+                torch.Tensor, self.decoder.model.embed_tokens(input_ids)
+            )
 
             if input_values is not None:
                 if audio_token_id is None:
@@ -519,25 +497,31 @@ class MiDashengLMModel(PreTrainedModel):
                         "If `input_values` is provided, `audio_token_id` must also be provided."
                     )
 
-                input_values = input_values.to(self.device)
-                audio_encoder_hidden_states = self._forward_audio_encoder(
-                    input_values, audio_length=audio_length
-                )
-            else:
-                audio_encoder_hidden_states = None
-            inputs_embeds = self._prepare_with_input_ids(
-                input_ids=input_ids,
-                audio_embeddings=audio_encoder_hidden_states,
-                audio_token_id=audio_token_id,
-            )
+                audio_embeddings = self._forward_audio_encoder(
+                    input_values,
+                    audio_length=audio_length,
+                ).to(inputs_embeds.dtype)
+
+                audio_mask = (input_ids == audio_token_id).flatten()
+                diff = torch.diff(
+                    audio_mask.long(),
+                    prepend=torch.zeros(
+                        (1,),
+                        dtype=torch.long,
+                        device=audio_mask.device,
+                    ),
+                )
+                audio_span_starts = (diff == 1).nonzero()
+                audio_span_ends = (diff == -1).nonzero()
+
+                embeds_view = inputs_embeds.view(-1, inputs_embeds.shape[-1])
+                for span_start, span_end, audio in zip(
+                    audio_span_starts,
+                    audio_span_ends,
+                    audio_embeddings,
+                    strict=True,
+                ):
+                    embeds_view[span_start:span_end] = audio[: span_end - span_start]
         else:
             if inputs_embeds is None:
                 raise ValueError(
@@ -548,6 +532,24 @@ class MiDashengLMModel(PreTrainedModel):
                     "Cannot pass `input_values` when `inputs_embeds` is provided."
                 )
 
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        input_values: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        audio_length: Optional[Iterable[int]] = None,
+        audio_token_id: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder(
             input_ids=None,
             inputs_embeds=inputs_embeds,
@@ -563,35 +565,13 @@ class MiDashengLMModel(PreTrainedModel):
         audio_token_id: Optional[int] = None,
         **kwargs,
     ):
-        if input_ids is not None:
-            if inputs_embeds is not None:
-                raise ValueError(
-                    "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
-                )
-
-            if input_values is not None:
-                input_values = input_values.to(self.device)
-                audio_encoder_hidden_states = self._forward_audio_encoder(
-                    input_values, audio_length=audio_length
-                )
-            else:
-                audio_encoder_hidden_states = None
-            inputs_embeds = self._prepare_with_input_ids(
-                input_ids=input_ids,
-                audio_embeddings=audio_encoder_hidden_states,
-                audio_token_id=audio_token_id,
-            )
-        else:
-            if inputs_embeds is None:
-                raise ValueError(
-                    "Either `input_ids` or `inputs_embeds` must be passed."
-                )
-            if input_values is not None:
-                raise ValueError(
-                    "Cannot pass `input_values` when `inputs_embeds` is provided."
-                )
-
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder.generate(
             inputs_embeds=inputs_embeds,
             generation_config=kwargs.pop("generation_config", self.generation_config),
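The heart of the new `_prepare_inputs_embeds` is the span-detection trick: `torch.diff` on a 0/1 placeholder mask emits +1 at the first position of each contiguous run of audio tokens and -1 at the position just past its end, so each audio clip's embeddings can be scattered into its own span. A self-contained toy demonstration (the token id is made up for the demo):

```python
import torch

AUDIO = 151646  # hypothetical audio placeholder token id
input_ids = torch.tensor([[7, AUDIO, AUDIO, AUDIO, 9, 3, AUDIO, AUDIO, 5]])

# 0/1 mask over the flattened batch, exactly as in _prepare_inputs_embeds.
audio_mask = (input_ids == AUDIO).flatten()
diff = torch.diff(
    audio_mask.long(),
    prepend=torch.zeros((1,), dtype=torch.long, device=audio_mask.device),
)
starts = (diff == 1).nonzero()   # rising edges: run starts
ends = (diff == -1).nonzero()    # falling edges: one past run ends

for s, e in zip(starts, ends):
    print(f"audio span: [{int(s)}, {int(e)})")
# audio span: [1, 4)
# audio span: [6, 8)
```

Note two properties of this scheme: a run that extends to the very end of the sequence never produces a falling edge, so it implicitly assumes each placeholder run is followed by at least one ordinary token; and `strict=True` in the zip over spans and audio embeddings makes a mismatch between the number of placeholder runs and the number of encoded clips fail loudly instead of silently truncating.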