{
  "architecture": "chatterbox_turbo",
  "dec_cond_len_seconds": 10,
  "enc_cond_len_seconds": 15,
  "gpt2": {
    "activation_function": "gelu_new",
    "n_ctx": 8196,
    "n_embd": 1024,
    "hidden_size": 1024,
    "n_head": 16,
    "n_layer": 24,
    "n_positions": 8196,
    "vocab_size": 50276,
    "layer_norm_epsilon": 1e-05,
    "attn_pdrop": 0.1,
    "embd_pdrop": 0.1,
    "resid_pdrop": 0.1
  },
  "model_type": "chatterbox_turbo",
  "quantization": {
    "group_size": 64,
    "bits": 5,
    "mode": "affine"
  },
  "quantization_config": {
    "group_size": 64,
    "bits": 5,
    "mode": "affine"
  },
  "s3gen": {
    "output_sample_rate": 24000,
    "input_sample_rate": 16000,
    "silence_token": 4299,
    "speech_vocab_size": 6561,
    "meanflow": true,
    "token_embedding_dim": 512,
    "encoder_attention_heads": 8,
    "encoder_linear_units": 2048,
    "encoder_num_blocks": 6,
    "encoder_dropout_rate": 0.1,
    "decoder_in_channels": 320,
    "decoder_out_channels": 80,
    "decoder_channels": [
      256
    ],
    "decoder_attention_head_dim": 64,
    "decoder_n_blocks": 4,
    "decoder_num_mid_blocks": 12,
    "decoder_num_heads": 8,
    "cfm_sigma_min": 1e-06,
    "cfm_t_scheduler": "cosine",
    "cfm_inference_cfg_rate": 0.7
  },
  "sample_rate": 24000,
  "t3": {
    "start_text_token": 255,
    "stop_text_token": 0,
    "text_tokens_dict_size": 50276,
    "max_text_tokens": 2048,
    "start_speech_token": 6561,
    "stop_speech_token": 6562,
    "speech_tokens_dict_size": 6563,
    "max_speech_tokens": 4096,
    "llama_config_name": "GPT2_medium",
    "input_pos_emb": null,
    "speech_cond_prompt_len": 375,
    "encoder_type": "voice_encoder",
    "speaker_embed_size": 256,
    "use_perceiver_resampler": false,
    "emotion_adv": false
  },
  "voice_encoder": {
    "num_mels": 40,
    "sample_rate": 16000,
    "speaker_embed_size": 256,
    "ve_hidden_size": 256,
    "flatten_lstm_params": false,
    "n_fft": 400,
    "hop_size": 160,
    "win_size": 400,
    "fmax": 8000,
    "fmin": 0,
    "preemphasis": 0.0,
    "mel_power": 2.0,
    "mel_type": "amp",
    "normalized_mels": false,
    "ve_partial_frames": 160,
    "ve_final_relu": true,
    "stft_magnitude_min": 0.0001
  }
}