LiKenun committed on
Commit
caf2559
·
1 Parent(s): c328580

Add text-to-speech (TTS) sample

Browse files
Files changed (3) hide show
  1. app.py +11 -0
  2. requirements.txt +3 -0
  3. text_to_speech.py +11 -0
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
5
  from image_classification import image_classification
6
  from image_to_text import image_to_text
7
  from text_to_image import text_to_image
 
8
  from utils import request_image
9
 
10
 
@@ -62,6 +63,16 @@ class App:
62
  inputs=image_classification_image_input,
63
  outputs=image_classification_output
64
  )
 
 
 
 
 
 
 
 
 
 
65
 
66
  demo.launch()
67
 
 
5
  from image_classification import image_classification
6
  from image_to_text import image_to_text
7
  from text_to_image import text_to_image
8
+ from text_to_speech import text_to_speech
9
  from utils import request_image
10
 
11
 
 
63
  inputs=image_classification_image_input,
64
  outputs=image_classification_output
65
  )
66
+ with gr.Tab("Text-to-speech (TTS)"):
67
+ gr.Markdown("Generate speech from a text.")
68
+ text_to_speech_text = gr.Textbox(label="Text")
69
+ text_to_speech_generate_button = gr.Button("Generate")
70
+ text_to_speech_output = gr.Audio(label="Speech")
71
+ text_to_speech_generate_button.click(
72
+ fn=text_to_speech,
73
+ inputs=text_to_speech_text,
74
+ outputs=text_to_speech_output
75
+ )
76
 
77
  demo.launch()
78
 
requirements.txt CHANGED
@@ -5,3 +5,6 @@ pandas>=2.0.0
5
  pillow>=10.0.0
6
  requests>=2.31.0
7
  transformers>=4.40.0
 
 
 
 
5
  pillow>=10.0.0
6
  requests>=2.31.0
7
  transformers>=4.40.0
8
+ timm>=1.0.0
9
+ inflect>=7.0.0
10
+ phonemizer>=3.0.0
text_to_speech.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from transformers import pipeline
3
+ from utils import spaces_gpu
4
+
5
@spaces_gpu
def text_to_speech(text: str) -> tuple[int, bytes]:
    """Synthesize speech for *text* with the ``kakao-enterprise/vits-ljs`` model.

    A fresh ``transformers`` text-to-speech pipeline is built on every call
    and released again before returning.

    Args:
        text: The text to be spoken.

    Returns:
        A ``(sampling_rate, audio)`` pair taken from the pipeline result,
        where ``audio`` is the first element of the result's ``"audio"``
        entry.
        NOTE(review): the audio payload is presumably a NumPy array rather
        than ``bytes``; the annotation is kept unchanged for interface
        compatibility -- confirm against the pipeline's output.
    """
    narrator = pipeline("text-to-speech", "kakao-enterprise/vits-ljs")
    try:
        # Run inference BEFORE dropping the pipeline; deleting it first
        # makes the call below fail with UnboundLocalError.
        result = narrator(text)
    finally:
        # Release the model even if inference raised, then prompt the
        # collector to reclaim it.
        del narrator
        gc.collect()
    return (result["sampling_rate"], result["audio"][0])