# Mr-HASSAN
# Fix: Keep letters visible after sending to LLM, allow answer building
# fadaba4
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import logging
import gc
import torch
from collections import defaultdict
# `spaces` is an optional dependency: record whether it imported so the GPU
# decorator can be disabled when it is missing.
try:
    import spaces
except ImportError:
    SPACES_AVAILABLE = False
    print("โš ๏ธ spaces not available, GPU decorator disabled")
else:
    SPACES_AVAILABLE = True

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model instances - start as None and are assigned lazily by the loaders.
detector = None
translator = None
medical_agent = None
speech_processor = None

import time


def _new_session_state():
    """Return a fresh state dict for one session (new containers on every call)."""
    return {
        'letters': [],  # Accumulated letters
        'words': [],  # Built words
        'question_count': 0,
        'history': [],
        'last_letter': None,
        'last_letter_time': 0,
        'letter_stable_count': 0,
        'waiting_for_answer': False,  # Set while an LLM question is outstanding
        'current_question': '',  # Text of the current LLM question
    }


# Session tracking keyed by session id: letters, words, question count,
# history and last-detection bookkeeping. Unknown ids get a fresh state dict.
sessions = defaultdict(_new_session_state)
def setup_environment():
    """Report which compute device is available.

    Returns:
        ``'cuda'`` when ``torch.cuda.is_available()`` is true, otherwise
        ``'cpu'``. The choice is also written to the log.
    """
    has_gpu = torch.cuda.is_available()
    if has_gpu:
        logger.info("โœ… GPU available - using CUDA")
        return 'cuda'

    logger.info("โš ๏ธ GPU not available - using CPU")
    return 'cpu'
def initialize_models():
    """Load the models that are needed before the interface starts.

    Assigns the module-level ``detector`` (an ``ArabicSignDetector``) and
    ``speech_processor`` (a ``SpeechProcessor``). Between the two loads the
    garbage collector is run and, when CUDA is available, the CUDA cache is
    emptied.

    Raises:
        Exception: whatever the imports or constructors raise; the error is
            logged and then re-raised unchanged.
    """
    global detector, speech_processor

    logger.info("๐Ÿ”„ Initializing essential models...")
    try:
        # Sign detector first.
        from utils.detector import ArabicSignDetector
        detector = ArabicSignDetector()
        logger.info("โœ… YOLO Detector loaded")

        # Free memory before loading the next model.
        gc.collect()
        cuda_ready = torch.cuda.is_available()
        if cuda_ready:
            torch.cuda.empty_cache()

        # Speech processing.
        from utils.speech import SpeechProcessor
        speech_processor = SpeechProcessor()
        logger.info("โœ… Speech Processor loaded")

        logger.info("๐ŸŽ‰ Essential models loaded!")
    except Exception as exc:
        logger.error(f"โŒ Model loading failed: {exc}")
        raise
def get_translator():
    """Return the shared translator, creating it on first use.

    The first call tries to build a ``MedicalTranslator``. If that fails for
    any reason the error is logged and a pass-through translator is installed
    instead, whose ``ar_to_en`` / ``en_to_ar`` return their input unchanged.
    Later calls return the cached instance.
    """
    global translator

    if translator is not None:
        return translator

    try:
        from utils.translator import MedicalTranslator
        translator = MedicalTranslator()
        logger.info("โœ… Translator loaded")
    except Exception as exc:
        logger.error(f"โŒ Translator loading failed: {exc}")

        class FallbackTranslator:
            def ar_to_en(self, text):
                return text

            def en_to_ar(self, text):
                return text

        translator = FallbackTranslator()

    return translator
def get_medical_agent():
    """Return the shared medical agent, creating it on first use.

    The first call tries to build a ``HuatuoMedicalAgent`` configured with
    ``max_questions=3`` and ``max_words_per_question=5``. If that fails the
    error is logged and a ``LiteMedicalAgent`` is created instead; an error
    raised while building the lite agent is not caught here. Later calls
    return the cached instance.
    """
    global medical_agent

    if medical_agent is not None:
        return medical_agent

    try:
        from utils.medical_agent import HuatuoMedicalAgent
        medical_agent = HuatuoMedicalAgent(max_questions=3, max_words_per_question=5)
        logger.info("โœ… HuatuoGPT Medical Agent loaded")
    except Exception as exc:
        logger.error(f"โŒ HuatuoGPT failed, using lite: {exc}")
        from utils.medical_agent_lite import LiteMedicalAgent
        medical_agent = LiteMedicalAgent()

    return medical_agent
# GPU decorator factory - wraps with spaces.GPU only when `spaces` imported
def gpu_decorator(duration=30):
    """Return a decorator that requests a GPU when ``spaces`` is available.

    Args:
        duration: forwarded to ``spaces.GPU(duration=...)``.

    Returns:
        ``spaces.GPU(duration=duration)`` when ``SPACES_AVAILABLE`` is true,
        otherwise a decorator that returns the function unchanged.
    """
    if not SPACES_AVAILABLE:
        def _identity(func):
            return func

        return _identity

    return spaces.GPU(duration=duration)
@gpu_decorator(duration=30)
def process_sign_language_stream(image, session_id="default"):
    """Process one streamed camera frame and update the session's word.

    The frame is passed to ``detector.detect_letters``. The top detected
    letter is appended to the session's letters once it has been seen on two
    consecutive processed frames and differs from the last appended letter.
    Frames without a detection leave the accumulated letters untouched.

    Args:
        image: a PIL image or an array-like frame; ``None`` means the camera
            has not delivered a frame yet.
        session_id: key into the module-level ``sessions`` mapping.

    Returns:
        A 4-tuple of display strings: (detection status, accumulated word,
        translation, response/hint). Any exception is logged and reported in
        the first and last elements instead of being raised.
    """
    # Consecutive frames a sign must be held before its letter is accepted.
    required_stable_frames = 2

    try:
        if image is None:
            return "โณ ุงู†ุชุธุงุฑ ุงู„ูƒุงู…ูŠุฑุง...", "", "", ""

        # `sessions` is a defaultdict, so indexing creates the state for a new id.
        session = sessions[session_id]

        # Convert to numpy array
        image_np = np.array(image) if isinstance(image, Image.Image) else image

        # Convert RGB to BGR for OpenCV
        if len(image_np.shape) == 3 and image_np.shape[2] == 3:
            image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

        # Detect Arabic letters
        detection_result = detector.detect_letters(image_np)
        current_time = time.time()

        if not detection_result['success']:
            # No detection - keep showing current state, don't reset
            if session['letters']:
                current_word_ar = ''.join(session['letters'])
                translator_instance = get_translator()
                current_word_en = translator_instance.ar_to_en(current_word_ar)
                word_display = f"๐Ÿ”ค ุงู„ูƒู„ู…ุฉ: {current_word_ar}\n๐Ÿ“Š ุงู„ุญุฑูˆู: {' + '.join(session['letters'])}"
                translation_display = f"๐ŸŒ ุงู„ุชุฑุฌู…ุฉ: {current_word_en}"
                status_display = "โธ๏ธ ู„ุง ุชูˆุฌุฏ ุฅุดุงุฑุฉ - ุฃุธู‡ุฑ ุงู„ุญุฑู ุงู„ุชุงู„ูŠ ุฃูˆ ุงุถุบุท 'ุฅูƒู…ุงู„ ุงู„ูƒู„ู…ุฉ'"
                return status_display, word_display, translation_display, "๐Ÿ’ก ู†ุตูŠุญุฉ: ุงุถุบุท 'ุฅูƒู…ุงู„ ุงู„ูƒู„ู…ุฉ' ู„ู„ุญุตูˆู„ ุนู„ู‰ ุฑุฏ ุทุจูŠ"
            return "๐Ÿ” ุฌุงู‡ุฒ ู„ู„ูƒุดู - ุฃุธู‡ุฑ ุฅุดุงุฑุฉ ูŠุฏ", "", "", "๐ŸŽฅ ุงุจุฏุฃ ุจุฅุธู‡ุงุฑ ุฅุดุงุฑุงุช ุงู„ูŠุฏ"

        # Get detected letter (the first entry is used as the best candidate)
        detected_letters = detection_result['letters']
        confidences = detection_result.get('confidences', [])
        if not detected_letters:
            return "๐Ÿ” ุฌุงู‡ุฒ ู„ู„ูƒุดู...", "", "", ""

        current_letter = detected_letters[0]
        current_confidence = confidences[0] if confidences else 0.0

        # Letter stabilization: count consecutive frames showing the same letter
        if current_letter == session['last_letter']:
            session['letter_stable_count'] += 1
        else:
            session['last_letter'] = current_letter
            session['letter_stable_count'] = 1

        # Add the letter once it has been stable long enough, unless it merely
        # repeats the letter that was appended last.
        if session['letter_stable_count'] >= required_stable_frames:
            if not session['letters'] or session['letters'][-1] != current_letter:
                session['letters'].append(current_letter)
                session['letter_stable_count'] = 0  # Reset counter
                logger.info(f"๐Ÿ“ Added letter: {current_letter}")

        # Build current word from accumulated letters
        current_word_ar = ''.join(session['letters'])

        # Translate word to English
        translator_instance = get_translator()
        current_word_en = translator_instance.ar_to_en(current_word_ar) if current_word_ar else ""

        # Record the time of this active detection
        session['last_letter_time'] = current_time

        # The counter keeps growing while an already-added sign is held, so
        # clamp what is displayed to the 0..required range.
        stable_shown = min(session['letter_stable_count'], required_stable_frames)
        stability_bar = "๐ŸŸข" * stable_shown + "โšช" * (required_stable_frames - stable_shown)
        detected_info_ar = f"๐ŸŽฏ ุงู„ุญุฑู ุงู„ุญุงู„ูŠ: {current_letter} ({current_confidence:.0%})\n{stability_bar} ุซุจุงุช: {stable_shown}/{required_stable_frames} (~1 ุซุงู†ูŠุฉ)"

        # Format word display in Arabic
        word_display_ar = f"๐Ÿ”ค ุงู„ูƒู„ู…ุฉ: {current_word_ar if current_word_ar else '...'}"
        if session['letters']:
            word_display_ar += f"\n๐Ÿ“Š ุงู„ุญุฑูˆู: {' + '.join(session['letters'])}"

        # Format translation in Arabic
        translation_display_ar = f"๐ŸŒ ุงู„ุชุฑุฌู…ุฉ: {current_word_en}" if current_word_en else "โณ ุฃูƒู…ู„ ุงู„ูƒู„ู…ุฉ..."

        # If waiting for answer, show question + allow building answer word
        response_display_ar = ""
        if session['waiting_for_answer']:
            # Show question while building answer
            response_display_ar = session['current_question']
            if len(session['letters']) >= 1:
                response_display_ar += f"\n\nโœ๏ธ ุฅุฌุงุจุชูƒ: {current_word_ar} ({len(session['letters'])} ุญุฑูˆู)"
            else:
                response_display_ar += "\n\n๐Ÿ‘‰ ุฃุธู‡ุฑ ุฅุฌุงุจุชูƒ ุจุงู„ุฅุดุงุฑุงุช"
        # Normal mode - show hint to press button when word is ready
        elif len(session['letters']) >= 3 and current_word_en:
            response_display_ar = f"โœ… ุงู„ูƒู„ู…ุฉ ุฌุงู‡ุฒุฉ!\n๐Ÿ’ก ุงุถุบุท 'ุฅุฑุณุงู„ ู„ู„ุฐูƒุงุก ุงู„ุงุตุทู†ุงุนูŠ' ู„ู„ุชุญู„ูŠู„"
        elif session['letters']:
            response_display_ar = f"โณ ุงุณุชู…ุฑ ููŠ ุงู„ุฅุดุงุฑุงุช... ({len(session['letters'])} ุญุฑูˆู ุญุชู‰ ุงู„ุขู†)"
        else:
            response_display_ar = "๐ŸŽฅ ุงุจุฏุฃ ุจุฅุธู‡ุงุฑ ุฅุดุงุฑุงุช ุงู„ูŠุฏ"

        return detected_info_ar, word_display_ar, translation_display_ar, response_display_ar
    except Exception as e:
        # logger.exception records the message together with the traceback.
        logger.exception(f"Error processing sign: {e}")
        return f"โŒ ุฎุทุฃ: {str(e)}", "", "", "ุงู„ุฑุฌุงุก ุงู„ู…ุญุงูˆู„ุฉ ู…ุฑุฉ ุฃุฎุฑู‰"
def process_doctor_audio(audio, session_id="default"):
    """Turn the doctor's recorded speech into a question for the patient.

    The audio is transcribed with ``speech_processor.speech_to_text``, the
    transcript is passed to the medical agent's ``process_doctor_input`` and
    the result is translated with the translator's ``en_to_ar``.

    Args:
        audio: the recording handed over by the audio widget; ``None`` when
            nothing was recorded.
        session_id: accepted for interface symmetry; not used here.

    Returns:
        A pair ``(transcript message, question message)``. On failure the
        first element carries the error text and the second is empty.
    """
    try:
        if audio is None:
            return "โŒ No audio provided", ""

        # Speech -> text
        transcript = speech_processor.speech_to_text(audio)
        logger.info(f"๐ŸŽค Doctor said: {transcript}")

        # Text -> question for the patient
        agent = get_medical_agent()
        question_en = agent.process_doctor_input(transcript)

        # Question -> Arabic
        question_ar = get_translator().en_to_ar(question_en)

        transcript_msg = f"๐ŸŽค You said: {transcript}"
        question_msg = f"โ“ Question for patient: {question_ar}"
        return transcript_msg, question_msg
    except Exception as err:
        logger.error(f"Error processing audio: {err}")
        return f"โŒ Error: {str(err)}", ""
def reset_session(session_id="default"):
    """Drop all stored state for ``session_id`` and return a confirmation message.

    Removing an id that has no stored state is a no-op.
    """
    # pop() with a default neither raises nor creates an entry for unknown ids.
    sessions.pop(session_id, None)
    return "๐Ÿ”„ ุชู… ุฅุนุงุฏุฉ ุชุนูŠูŠู† ุงู„ุฌู„ุณุฉ ุจู†ุฌุงุญ!\n\nโœ… Session reset - all letters and words cleared!"
def complete_word(session_id="default"):
    """Handle the "send" button: submit the current word or store an answer.

    The button has two modes, chosen by ``session['waiting_for_answer']``:

    * Not waiting: the accumulated letters are joined into a word, translated
      to English, sent to the medical agent, and the agent's response is
      translated back and stored as the current question. The letters are
      deliberately kept so they stay visible.
    * Waiting: the accumulated letters (if any) are saved as the answer word,
      then the letters and the pending question are cleared.

    Args:
        session_id: key into the module-level ``sessions`` mapping.

    Returns:
        A status string for the UI. If translation or the medical agent
        fails, the error is logged and an error string is returned; the
        session is left unchanged so the user can press the button again.
    """
    if session_id not in sessions:
        return "โš ๏ธ ู„ุง ุชูˆุฌุฏ ุฌู„ุณุฉ - ุงุจุฏุฃ ุจุฅุธู‡ุงุฑ ุฅุดุงุฑุงุช"

    session = sessions[session_id]

    # If already waiting for answer, this button saves answer and resumes detection
    if session['waiting_for_answer']:
        # Save answer word if there are letters
        if session['letters']:
            answer_word_ar = ''.join(session['letters'])
            session['words'].append(answer_word_ar)
            result_msg = f"โœ… ุชู… ุญูุธ ุงู„ุฅุฌุงุจุฉ: {answer_word_ar}"
        else:
            result_msg = "โš ๏ธ ู„ู… ูŠุชู… ุฅุฏุฎุงู„ ุฅุฌุงุจุฉ"

        # Clear and resume detection
        session['letters'] = []
        session['last_letter'] = None
        session['letter_stable_count'] = 0
        session['waiting_for_answer'] = False
        session['current_question'] = ''
        return f"{result_msg}\n\n๐Ÿ”„ ุงู„ูƒุดู ู…ุณุชุฃู†ู - ุฌุงู‡ุฒ ู„ู„ูƒู„ู…ุฉ ุงู„ุชุงู„ูŠุฉ!"

    # Check if we have letters to send
    if not session['letters']:
        return "โš ๏ธ ู„ุง ุชูˆุฌุฏ ุฃุญุฑู - ุฃุธู‡ุฑ ุฅุดุงุฑุงุช ุฃูˆู„ุงู‹"

    # First press: send the word to the LLM
    current_word_ar = ''.join(session['letters'])

    # Do all fallible work before touching the session, so a failure neither
    # escapes to the UI as an unhandled error nor leaves a half-updated session
    # (e.g. the word recorded twice after a retry).
    try:
        # Translate to English
        translator_instance = get_translator()
        current_word_en = translator_instance.ar_to_en(current_word_ar)
        logger.info(f"๐Ÿ“ค Sending to HuatuoGPT: {current_word_ar} โ†’ {current_word_en}")

        # Get medical response from the agent
        medical_agent_instance = get_medical_agent()
        agent_response = medical_agent_instance.process_input(
            current_word_en,
            session_id=session_id
        )

        # Translate response to Arabic
        arabic_medical_response = translator_instance.en_to_ar(agent_response['response'])
        question_count = agent_response['question_count']
    except Exception as e:
        logger.exception(f"Error sending word to medical agent: {e}")
        return f"โŒ ุฎุทุฃ: {str(e)}"

    # Update session
    session['words'].append(current_word_ar)
    session['question_count'] = question_count
    session['history'].append(f"ุงู„ู…ุฑูŠุถ: {current_word_ar} ({current_word_en})")
    session['history'].append(f"ุงู„ุทุจูŠุจ: {arabic_medical_response}")

    # Mark the question as pending and store it for display
    session['waiting_for_answer'] = True
    session['current_question'] = f"๐Ÿ‘จโ€โš•๏ธ ุงู„ุทุจูŠุจ ({question_count}/3):\n{arabic_medical_response}\n\nโธ๏ธ ุงู„ูƒุดู ู…ุชูˆู‚ู - ุฃุธู‡ุฑ ุฅุฌุงุจุชูƒ ุซู… ุงุถุบุท ุงู„ุฒุฑ ู…ุฑุฉ ุฃุฎุฑู‰"

    # DON'T clear letters yet - keep them visible
    # Only clear last_letter tracking for new word detection
    session['last_letter'] = None
    session['letter_stable_count'] = 0

    return f"โœ… ุชู… ุฅุฑุณุงู„: {current_word_ar} โ†’ {current_word_en}\n\n๐Ÿค– HuatuoGPT ูŠุญู„ู„...\n\n{session['current_question']}"
def create_interface():
"""Build the Gradio Blocks application and wire up its event handlers.

Creates three tabs (sign language detection, doctor's voice input, system
info), connects the webcam stream and the buttons to the handler functions
defined in this module, and returns the Blocks object without launching it.
"""
with gr.Blocks(title="Arabic Sign Language Medical Interpreter") as app:
# Page header
gr.Markdown(
"""
# ๐Ÿฅ Arabic Sign Language Medical Interpreter
This system helps deaf patients communicate with doctors using Arabic sign language.
## ๐ŸŽฏ How to use:
1. **Patient**: Show Arabic sign language to the camera
2. **System**: Detects signs, translates, and provides medical questions
3. **Doctor**: Can also speak questions which will be converted for the patient
"""
)
# Session key passed to every handler as `session_id`.
# NOTE(review): the initial value is a fixed string, so presumably all
# clients share the same `sessions` entry - confirm whether that is intended.
session_id = gr.State(value="default_session")
# Tab 1: webcam sign detection
with gr.Tab("๐Ÿ“น Sign Language Detection"):
gr.Markdown("""
### ๐ŸŽฅ Real-Time Sign Detection / ุงู„ูƒุดู ููŠ ุงู„ูˆู‚ุช ุงู„ูุนู„ูŠ
**๐Ÿ”„ Workflow / ุณูŠุฑ ุงู„ุนู…ู„:**
1. **YOLO detects** - ุฃุธู‡ุฑ ุฅุดุงุฑุงุช ู„ุจู†ุงุก ูƒู„ู…ุฉ (ุซุงู†ูŠุฉ ู„ูƒู„ ุญุฑู)
2. **Press button** - ุงุถุบุท ุงู„ุฒุฑ ู„ุฅุฑุณุงู„ ุงู„ูƒู„ู…ุฉ
3. **YOLO pauses** - ูŠุชูˆู‚ู ุงู„ูƒุดู ุชู„ู‚ุงุฆูŠุงู‹
4. **HuatuoGPT analyzes** - ุงู„ุฐูƒุงุก ูŠุญู„ู„ ูˆูŠุณุฃู„
5. **Question shown** - ุงู„ุณุคุงู„ ูŠุธู‡ุฑ (ู„ุง ูŠุฎุชููŠ!)
6. **Show answer signs** - ุฃุธู‡ุฑ ุฅุฌุงุจุชูƒ
7. **Press button again** - ู„ู„ู…ุชุงุจุนุฉ
๐Ÿšจ **Detection pauses during LLM questions - question stays visible!**
""")
with gr.Row():
with gr.Column():
# Streaming webcam input; frames are delivered as PIL images
image_input = gr.Image(
sources=["webcam"],
type="pil",
label="๐Ÿ“น Live Camera Feed / ูƒุงู…ูŠุฑุง ู…ุจุงุดุฑุฉ",
streaming=True
)
with gr.Row():
complete_word_btn = gr.Button("๐Ÿค– ุฅุฑุณุงู„ ู„ู„ุฐูƒุงุก (Send to AI)", variant="primary", size="lg")
clear_btn = gr.Button("๐Ÿ”„ ู…ุณุญ (Clear All)", variant="secondary")
with gr.Column():
# The first four textboxes receive the 4-tuple returned by
# process_sign_language_stream, in this order.
detected_output = gr.Textbox(
label="โœ… ู†ุชุงุฆุฌ ุงู„ูƒุดู / Detection Results",
lines=3,
placeholder="ุณุชุธู‡ุฑ ุงู„ุญุฑูˆู ุงู„ู…ูƒุชุดูุฉ ู‡ู†ุง / Detected letters will appear here..."
)
arabic_output = gr.Textbox(
label="๐Ÿ”ค ุงู„ูƒู„ู…ุฉ ุงู„ุนุฑุจูŠุฉ / Arabic Word",
lines=2,
placeholder="ุงู„ูƒู„ู…ุฉ ุงู„ู…ุชุฑุงูƒู…ุฉ / Accumulated word..."
)
english_output = gr.Textbox(
label="๐ŸŒ ุงู„ุชุฑุฌู…ุฉ / Translation",
lines=2,
placeholder="ุงู„ุชุฑุฌู…ุฉ ุงู„ุฅู†ุฌู„ูŠุฒูŠุฉ / English translation..."
)
response_output = gr.Textbox(
label="๐Ÿ‘จโ€โš•๏ธ ุงุณุชุฌุงุจุฉ ุงู„ุทุจูŠุจ / Medical AI Response",
lines=5,
placeholder="ุณุชุธู‡ุฑ ุงู„ุฃุณุฆู„ุฉ ุงู„ุทุจูŠุฉ ู‡ู†ุง / Medical questions will appear here..."
)
# Receives the status string returned by complete_word
word_status = gr.Textbox(
label="๐Ÿ“Š ุญุงู„ุฉ ุงู„ูƒู„ู…ุฉ / Word Status",
lines=2,
placeholder="Word completion status..."
)
gr.Markdown("""
### ๐Ÿ’ก Tips for Better Detection / ู†ุตุงุฆุญ ู„ู„ูƒุดู ุงู„ุฃูุถู„:
- **ุฅุถุงุกุฉ ุฌูŠุฏุฉ / Good Lighting**: Ensure your hands are well-lit
- **ุฎู„ููŠุฉ ูˆุงุถุญุฉ / Clear Background**: Use a plain background
- **ูˆุถุน ุงู„ูŠุฏ / Hand Position**: Keep hands centered in view
- **ูˆุถูˆุญ ุงู„ุฅุดุงุฑุฉ / Sign Clarity**: Make distinct, clear signs
- **ุงู„ู…ุณุงูุฉ / Distance**: Comfortable distance from camera
- **ุซุจุงุช / Stability**: Hold each sign steady for ~1 second (wait for ๐ŸŸข๐ŸŸข)
- **ู„ุง ุชุนูŠุฏ ุชุนูŠูŠู† / Don't Reset**: Move hand away between letters - word stays!
""")
# Auto-streaming detection - no manual button click needed
image_input.stream(
fn=process_sign_language_stream,
inputs=[image_input, session_id],
outputs=[detected_output, arabic_output, english_output, response_output],
stream_every=0.5 # Process every 0.5 seconds
)
# Send button: submits the word, or stores the answer when a question is pending
complete_word_btn.click(
fn=complete_word,
inputs=[session_id],
outputs=[word_status]
)
# Clear button handler: resets only the letter-accumulation keys of the
# session ('letters', 'last_letter', 'letter_stable_count'); the other
# session keys are not modified. Returns five empty strings, one per
# textbox listed in the click() outputs below.
def clear_all(session_id):
if session_id in sessions:
sessions[session_id]['letters'] = []
sessions[session_id]['last_letter'] = None
sessions[session_id]['letter_stable_count'] = 0
return "", "", "", "", ""
clear_btn.click(
fn=clear_all,
inputs=[session_id],
outputs=[detected_output, arabic_output, english_output, response_output, word_status]
)
# Tab 2: doctor's microphone input
with gr.Tab("๐ŸŽค Doctor's Voice Input"):
with gr.Row():
with gr.Column():
# Microphone recording is handed to the handler as a file path
audio_input = gr.Audio(
sources=["microphone"],
type="filepath",
label="Doctor's Voice"
)
audio_btn = gr.Button("๐ŸŽค Process Audio", variant="primary", size="lg")
with gr.Column():
doctor_text_output = gr.Textbox(label="๐ŸŽค Transcribed Text", lines=3)
question_output = gr.Textbox(label="โ“ Question for Patient (Arabic)", lines=3)
# process_doctor_audio returns (transcript message, question message)
audio_btn.click(
fn=process_doctor_audio,
inputs=[audio_input, session_id],
outputs=[doctor_text_output, question_output]
)
# Tab 3: static system information plus a session reset button
with gr.Tab("โ„น๏ธ System Info"):
gr.Markdown(
"""
## ๐Ÿ“Š System Features:
- **YOLO-based** Arabic sign language detection
- **Real-time** translation (Arabic โ†” English)
- **Medical AI** for intelligent questioning
- **ZeroGPU** optimization for efficient processing
## ๐Ÿ”ง Technical Stack:
- YOLOv8 for sign detection
- Helsinki-NLP for translation
- Whisper for speech recognition
- gTTS for text-to-speech
## ๐Ÿ’ก Tips:
- Ensure good lighting for better detection
- Make clear, distinct sign gestures
- Speak clearly into the microphone
"""
)
reset_btn = gr.Button("๐Ÿ”„ Reset Session", variant="secondary")
reset_output = gr.Textbox(label="Status", lines=1)
# reset_session deletes the whole session entry and returns a status message
reset_btn.click(
fn=reset_session,
inputs=[session_id],
outputs=[reset_output]
)
# Page footer
gr.Markdown(
"""
---
Built with โค๏ธ for accessible healthcare communication
"""
)
return app
# Script entry point: prepare the environment and models, then serve the UI
if __name__ == "__main__":
    logger.info("๐Ÿš€ Starting Arabic Sign Language Medical Interpreter...")

    # Environment check, then eager model loading
    setup_environment()
    initialize_models()

    # Build the interface, enable queuing and start the server
    app = create_interface()
    app.queue()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app.launch(**launch_options)