File size: 6,194 Bytes
14be0b9
 
 
 
 
 
 
 
 
 
 
29e22da
14be0b9
29e22da
 
 
 
 
 
 
 
14be0b9
 
 
29e22da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14be0b9
29e22da
 
 
 
 
 
 
 
 
 
 
14be0b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29e22da
14be0b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import gradio as gr
import datetime
import base64
import numpy as np
import dashscope
import os

# DashScope API key, read once at import time from the API_KEY environment
# variable. A missing variable raises KeyError here, before the UI is built.
API_KEY = os.environ['API_KEY']

# Maps each bilingual dropdown label ("<VoiceName> / <Chinese description>")
# to the voice identifier sent to the TTS API. The identifier is always the
# part of the label before " / ", so it is derived from the label itself.
# Insertion order is the order in which the labels are listed below.
VOICE_OPTIONS = {
    label: label.split(" / ", 1)[0]
    for label in (
        "Cherry / 芊悦",
        "Serena / 苏瑶",
        "Ethan / 晨煦",
        "Chelsie / 千雪",
        "Momo / 茉兔",
        "Vivian / 十三",
        "Moon / 月白",
        "Maia / 四月",
        "Kai / 凯",
        "Nofish / 不吃鱼",
        "Bella / 萌宝",
        "Jennifer / 詹妮弗",
        "Ryan / 甜茶",
        "Katerina / 卡捷琳娜",
        "Aiden / 艾登",
        "Bodega / 西班牙语-博德加",
        "Alek / 俄语-阿列克",
        "Dolce / 意大利语-多尔切",
        "Sohee / 韩语-素熙",
        "Ono Anna / 日语-小野杏",
        "Lenn / 德语-莱恩",
        "Sonrisa / 西班牙语拉美-索尼莎",
        "Emilien / 法语-埃米尔安",
        "Andre / 葡萄牙语欧-安德雷",
        "Radio Gol / 葡萄牙语巴-拉迪奥·戈尔",
        "Eldric Sage / 精品百人-沧明子",
        "Mia / 精品百人-乖小妹",
        "Mochi / 精品百人-沙小弥",
        "Bellona / 精品百人-燕铮莺",
        "Vincent / 精品百人-田叔",
        "Bunny / 精品百人-萌小姬",
        "Neil / 精品百人-阿闻",
        "Elias / 墨讲师",
        "Arthur / 精品百人-徐大爷",
        "Nini / 精品百人-邻家妹妹",
        "Ebona / 精品百人-诡婆婆",
        "Seren / 精品百人-小婉",
        "Pip / 精品百人-调皮小新",
        "Stella / 精品百人-美少女阿月",
        "Li / 南京-老李",
        "Marcus / 陕西-秦川",
        "Roy / 闽南-阿杰",
        "Peter / 天津-李彼得",
        "Eric / 四川-程川",
        "Rocky / 粤语-阿强",
        "Kiki / 粤语-阿清",
        "Sunny / 四川-晴儿",
        "Jada / 上海-阿珍",
        "Dylan / 北京-晓东",
    )
}

# Label pre-selected in the voice dropdown.
DEFAULT_VOICE = "Cherry / 芊悦"

# Maps each bilingual dropdown label ("<English> / <Chinese>") to the language
# name passed to the API as `language_type`; that name is the English part of
# the label, i.e. everything before " / ".
LANGUAGE_MAP = {
    label: label.split(" / ", 1)[0]
    for label in (
        "Auto / 自动",
        "English / 英文",
        "Chinese / 中文",
        "German / 德语",
        "Italian / 意大利语",
        "Portuguese / 葡萄牙语",
        "Spanish / 西班牙语",
        "Japanese / 日语",
        "Korean / 韩语",
        "French / 法语",
        "Russian / 俄语",
    )
}

# Dropdown choices, in the same order as the map above.
LANGUAGE_OPTIONS = list(LANGUAGE_MAP)

def tts_interface(text, voice_display, language_display):
    """Synthesize *text* to speech through the DashScope streaming TTS call.

    Args:
        text: Text to synthesize.
        voice_display: Dropdown label of the voice; must be a key of
            ``VOICE_OPTIONS``.
        language_display: Dropdown label of the text language; must be a key
            of ``LANGUAGE_MAP``.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D float32 array
        scaled by 1/32768, or ``None`` when there is nothing to play (blank
        input text, or the response carried no audio data).

    Raises:
        KeyError: If ``voice_display`` or ``language_display`` is not a known
            label.
    """
    voice_name = VOICE_OPTIONS[voice_display]

    # Translate the displayed language label into the API parameter value.
    language = LANGUAGE_MAP[language_display]

    print(f"text: {text}, {voice_name}, {language} time: {datetime.datetime.now()}\n")

    # Blank input: skip the API round-trip and leave the audio output empty.
    if not text or not text.strip():
        return None

    responses = dashscope.MultiModalConversation.call(
        api_key=API_KEY,
        model="qwen3-tts-flash-2025-11-27",
        text=text,
        voice=voice_name,
        stream=True,
        language_type=language,
    )

    pcm_parts = []
    for chunk in responses:
        try:
            audio_string = chunk.output.audio.data
        except (AttributeError, KeyError):
            # Chunk without an audio payload (e.g. an error response): log it
            # for diagnosis and keep consuming the stream.
            print(chunk)
            continue
        # Chunks may carry an empty/None payload; there is nothing to decode.
        if audio_string:
            pcm_parts.append(base64.b64decode(audio_string))

    # Decode once over the joined stream rather than per chunk, so a chunk
    # boundary that splits a 16-bit sample cannot make np.frombuffer fail.
    pcm_bytes = b"".join(pcm_parts)
    sample_count = len(pcm_bytes) // 2  # a dangling odd byte is dropped
    if sample_count == 0:
        return None

    # The payload is treated as raw 16-bit PCM at 24 kHz, as the original code
    # did -- NOTE(review): confirm against the model's documented output format.
    full_audio = (
        np.frombuffer(pcm_bytes, dtype=np.int16, count=sample_count).astype(np.float32)
        / 32768.0
    )

    sample_rate = 24000
    return (sample_rate, full_audio)

# Build the demo page: Soft theme with the Source Sans Pro Google font (falling
# back to Arial / sans-serif) and a CSS override that removes the container's
# max-width.
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]), css=".gradio-container {max-width: none !important;}") as demo:
    gr.Markdown("# 🎤 Qwen3-TTS Demo")

    # One row with two columns: the inputs in the first, the generated audio
    # in the second. All labels are bilingual with the English text first.
    with gr.Row():
        with gr.Column():
            # Input text - English first
            text_input = gr.Textbox(
                label="Input Text / 输入文本",
                placeholder="Enter text to synthesis here... / 在此输入要合成为语音的文本...",
                lines=4,
                max_lines=8
            )

            # Voice selection - English first
            voice_select = gr.Dropdown(
                label="Select Voice / 选择发音人",
                choices=list(VOICE_OPTIONS.keys()),
                value=DEFAULT_VOICE
            )

            # Language selection - English first
            language_select = gr.Dropdown(
                label="Select Text Language / 选择文本语言",
                choices=LANGUAGE_OPTIONS,
                value="Auto / 自动"
            )

            # Generate button - English first
            generate_btn = gr.Button("Generate Speech / 生成语音", variant="primary")

        with gr.Column():
            # Audio output - English first
            audio_output = gr.Audio(label="Generated Speech / 生成的语音", interactive=False)

    # Example inputs - English first. Each row is (text, voice label,
    # language label) and is bound to the three input components.
    examples = gr.Examples(
        examples=[
            ["你好,我是通义千问,很高兴认识你。", "Cherry / 芊悦", "Chinese / 中文"],
            ["你好,我是通义千问,很高兴认识你。", "Dylan / 北京-晓东", "Chinese / 中文"],
            ["Hello, this is a text-to-speech demo", "Jennifer / 詹妮弗", "English / 英文"],
            ["こんにちは、これはデモです", "Cherry / 芊悦", "Japanese / 日语"],
        ],
        inputs=[text_input, voice_select, language_select],
        label="Examples / 示例文本"
    )

    # Clicking the button calls tts_interface with the three inputs and sends
    # its return value to the audio component.
    generate_btn.click(
        fn=tts_interface,
        inputs=[text_input, voice_select, language_select],
        outputs=audio_output
    )

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()