Spaces:
Sleeping
Sleeping
| import os | |
| os.environ['CUDA_VISIBLE_DEVICES'] = '0' | |
| import gradio as gr | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| import logging | |
| import gc | |
| import torch | |
| from collections import defaultdict | |
# Optional dependency: remember whether `spaces` could be imported so that
# gpu_decorator() can decide whether to use it.
try:
    import spaces
except ImportError:
    SPACES_AVAILABLE = False
    print("โ ๏ธ spaces not available, GPU decorator disabled")
else:
    SPACES_AVAILABLE = True

# Module-wide logging at INFO level, with a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model handles. Each stays None until its loader assigns it
# (initialize_models, get_translator, get_medical_agent).
detector = None
translator = None
medical_agent = None
speech_processor = None

import time  # time.time() timestamps detections in the stream handler


def _new_session():
    """Return a fresh, independent state record for one session id."""
    return {
        'letters': [],  # Accumulated letters
        'words': [],  # Built words
        'question_count': 0,
        'history': [],
        'last_letter': None,
        'last_letter_time': 0,
        'letter_stable_count': 0,
        'waiting_for_answer': False,  # Pause detection when LLM asks question
        'current_question': '',  # Store current LLM question
    }


# Session tracking: letters, words, question count, history, last detection
# time. Indexing an unknown session id creates its record on the fly.
sessions = defaultdict(_new_session)
def setup_environment():
    """Log whether CUDA is usable and return the matching device name.

    Returns:
        'cuda' when torch.cuda.is_available() is true, otherwise 'cpu'.
    """
    if not torch.cuda.is_available():
        logger.info("โ ๏ธ GPU not available - using CPU")
        return 'cpu'
    logger.info("โ GPU available - using CUDA")
    return 'cuda'
def initialize_models():
    """Load the models needed at startup: the sign detector and speech processor.

    Assigns the module-level `detector` and `speech_processor`. Any failure is
    logged and then re-raised to the caller.
    """
    global detector, speech_processor

    def _release_memory():
        # Free Python garbage and, when CUDA is present, the CUDA cache
        # between the two model loads.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    logger.info("๐ Initializing essential models...")
    try:
        # Project imports stay inside the function so they only run on demand.
        from utils.detector import ArabicSignDetector
        detector = ArabicSignDetector()
        logger.info("โ YOLO Detector loaded")

        _release_memory()

        from utils.speech import SpeechProcessor
        speech_processor = SpeechProcessor()
        logger.info("โ Speech Processor loaded")
        logger.info("๐ Essential models loaded!")
    except Exception as exc:
        logger.error(f"โ Model loading failed: {exc}")
        raise
def get_translator():
    """Return the shared translator, creating it on first use.

    If MedicalTranslator cannot be imported or constructed, the error is
    logged and a pass-through object is cached instead; its ar_to_en and
    en_to_ar methods return the text unchanged.
    """
    global translator
    if translator is not None:
        return translator

    try:
        from utils.translator import MedicalTranslator
        translator = MedicalTranslator()
        logger.info("โ Translator loaded")
    except Exception as exc:
        logger.error(f"โ Translator loading failed: {exc}")

        class FallbackTranslator:
            def ar_to_en(self, text):
                return text

            def en_to_ar(self, text):
                return text

        translator = FallbackTranslator()
    return translator
def get_medical_agent():
    """Return the shared medical agent, creating it on first use.

    Tries HuatuoMedicalAgent first; on any failure the error is logged and
    LiteMedicalAgent is used instead. A failure of the lite agent itself is
    not caught here.
    """
    global medical_agent
    if medical_agent is not None:
        return medical_agent

    try:
        from utils.medical_agent import HuatuoMedicalAgent
        medical_agent = HuatuoMedicalAgent(max_questions=3, max_words_per_question=5)
        logger.info("โ HuatuoGPT Medical Agent loaded")
    except Exception as exc:
        logger.error(f"โ HuatuoGPT failed, using lite: {exc}")
        from utils.medical_agent_lite import LiteMedicalAgent
        medical_agent = LiteMedicalAgent()
    return medical_agent
# GPU decorator - only apply if spaces is available
def gpu_decorator(duration=30):
    """Return spaces.GPU(duration=...) when `spaces` imported, else a no-op.

    Args:
        duration: forwarded to spaces.GPU as its `duration` keyword.
    """
    if not SPACES_AVAILABLE:
        # Identity decorator: hand the function back untouched.
        def passthrough(f):
            return f

        return passthrough
    return spaces.GPU(duration=duration)
def _to_bgr_array(image):
    """Return *image* as an array, with 3-channel input converted RGB -> BGR."""
    image_np = np.array(image) if isinstance(image, Image.Image) else image
    if len(image_np.shape) == 3 and image_np.shape[2] == 3:
        image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    return image_np


def process_sign_language_stream(image, session_id="default"):
    """Process one streamed frame: detect a letter and grow the session's word.

    A letter is appended to the session once it has been the top detection
    for two consecutive frames and differs from the last appended letter.

    Args:
        image: a PIL image or an array-like frame; None means no frame yet.
        session_id: key into the module-level `sessions` mapping.

    Returns:
        A 4-tuple of display strings:
        (detection status, accumulated word, translation, response/hint).
        Any exception is logged and reported in the first element instead of
        being raised, so the stream keeps running.
    """
    required_frames = 2  # consecutive frames a letter must be held
    try:
        if image is None:
            return "โณ ุงูุชุธุงุฑ ุงููุงู ูุฑุง...", "", "", ""

        # `sessions` is a defaultdict: indexing creates the record if missing,
        # so no separate initialisation is needed here.
        session = sessions[session_id]

        # Detect Arabic letters on the BGR frame
        detection_result = detector.detect_letters(_to_bgr_array(image))
        current_time = time.time()

        if not detection_result['success']:
            # No detection - keep showing current state, don't reset
            if session['letters']:
                current_word_ar = ''.join(session['letters'])
                translator_instance = get_translator()
                current_word_en = translator_instance.ar_to_en(current_word_ar)
                word_display = f"๐ค ุงูููู ุฉ: {current_word_ar}\n๐ ุงูุญุฑูู: {' + '.join(session['letters'])}"
                translation_display = f"๐ ุงูุชุฑุฌู ุฉ: {current_word_en}"
                status_display = "โธ๏ธ ูุง ุชูุฌุฏ ุฅุดุงุฑุฉ - ุฃุธูุฑ ุงูุญุฑู ุงูุชุงูู ุฃู ุงุถุบุท 'ุฅูู ุงู ุงูููู ุฉ'"
                return status_display, word_display, translation_display, "๐ก ูุตูุญุฉ: ุงุถุบุท 'ุฅูู ุงู ุงูููู ุฉ' ููุญุตูู ุนูู ุฑุฏ ุทุจู"
            return "๐ ุฌุงูุฒ ูููุดู - ุฃุธูุฑ ุฅุดุงุฑุฉ ูุฏ", "", "", "๐ฅ ุงุจุฏุฃ ุจุฅุธูุงุฑ ุฅุดุงุฑุงุช ุงููุฏ"

        # The first entry is treated as the best detection
        detected_letters = detection_result['letters']
        confidences = detection_result.get('confidences', [])
        if not detected_letters:
            return "๐ ุฌุงูุฒ ูููุดู...", "", "", ""
        current_letter = detected_letters[0]
        current_confidence = confidences[0] if confidences else 0.0

        # Letter stabilization: count consecutive frames showing the same letter
        if current_letter == session['last_letter']:
            session['letter_stable_count'] += 1
        else:
            session['last_letter'] = current_letter
            session['letter_stable_count'] = 1
        # Every frame with a detection refreshes the timestamp
        session['last_letter_time'] = current_time

        # Snapshot the progress for display BEFORE a possible reset below, and
        # cap it at the target. Otherwise the bar reads 0/2 on the frame the
        # letter is accepted and 3/2, 4/2, ... while a sign is held.
        stable_frames = min(session['letter_stable_count'], required_frames)

        # Add letter after it's been stable for 2 frames (~1 second at 0.5s stream)
        if session['letter_stable_count'] >= required_frames:
            # Skip a repeat of the letter that was appended last
            if not session['letters'] or session['letters'][-1] != current_letter:
                session['letters'].append(current_letter)
                session['letter_stable_count'] = 0  # Reset counter
                logger.info(f"๐ Added letter: {current_letter}")

        # Build current word from accumulated letters and translate it
        current_word_ar = ''.join(session['letters'])
        translator_instance = get_translator()
        current_word_en = translator_instance.ar_to_en(current_word_ar) if current_word_ar else ""

        # Format detection display
        stability_bar = "๐ข" * stable_frames + "โช" * (required_frames - stable_frames)
        detected_info_ar = f"๐ฏ ุงูุญุฑู ุงูุญุงูู: {current_letter} ({current_confidence:.0%})\n{stability_bar} ุซุจุงุช: {stable_frames}/{required_frames} (~1 ุซุงููุฉ)"

        # Format word display in Arabic
        word_display_ar = f"๐ค ุงูููู ุฉ: {current_word_ar if current_word_ar else '...'}"
        if session['letters']:
            word_display_ar += f"\n๐ ุงูุญุฑูู: {' + '.join(session['letters'])}"

        # Format translation in Arabic
        translation_display_ar = f"๐ ุงูุชุฑุฌู ุฉ: {current_word_en}" if current_word_en else "โณ ุฃูู ู ุงูููู ุฉ..."

        if session['waiting_for_answer']:
            # Keep the pending question visible while the answer is being built
            response_display_ar = session['current_question']
            if len(session['letters']) >= 1:
                response_display_ar += f"\n\nโ๏ธ ุฅุฌุงุจุชู: {current_word_ar} ({len(session['letters'])} ุญุฑูู)"
            else:
                response_display_ar += "\n\n๐ ุฃุธูุฑ ุฅุฌุงุจุชู ุจุงูุฅุดุงุฑุงุช"
        # Normal mode - show hint to press button when word is ready
        elif len(session['letters']) >= 3 and current_word_en:
            response_display_ar = "โ ุงูููู ุฉ ุฌุงูุฒุฉ!\n๐ก ุงุถุบุท 'ุฅุฑุณุงู ููุฐูุงุก ุงูุงุตุทูุงุนู' ููุชุญููู"
        elif session['letters']:
            response_display_ar = f"โณ ุงุณุชู ุฑ ูู ุงูุฅุดุงุฑุงุช... ({len(session['letters'])} ุญุฑูู ุญุชู ุงูุขู)"
        else:
            response_display_ar = "๐ฅ ุงุจุฏุฃ ุจุฅุธูุงุฑ ุฅุดุงุฑุงุช ุงููุฏ"

        return detected_info_ar, word_display_ar, translation_display_ar, response_display_ar
    except Exception as e:
        # UI boundary: log with traceback and report instead of raising
        logger.exception("Error processing sign: %s", e)
        return f"โ ุฎุทุฃ: {str(e)}", "", "", "ุงูุฑุฌุงุก ุงูู ุญุงููุฉ ู ุฑุฉ ุฃุฎุฑู"
def process_doctor_audio(audio, session_id="default"):
    """Transcribe the doctor's audio and turn it into a question for the patient.

    Args:
        audio: value passed to speech_processor.speech_to_text; None means
            nothing was recorded.
        session_id: accepted for a uniform handler signature; not used here.

    Returns:
        A pair (transcript message, question message). On failure the first
        element carries the error text and the second is empty.
    """
    if audio is None:
        return "โ No audio provided", ""

    try:
        # Speech -> text
        transcript = speech_processor.speech_to_text(audio)
        logger.info(f"๐ค Doctor said: {transcript}")

        # Text -> patient question, then translate it for the patient
        question_en = get_medical_agent().process_doctor_input(transcript)
        question_ar = get_translator().en_to_ar(question_en)
    except Exception as exc:
        logger.error(f"Error processing audio: {exc}")
        return f"โ Error: {exc!s}", ""

    return f"๐ค You said: {transcript}", f"โ Question for patient: {question_ar}"
def reset_session(session_id="default"):
    """Drop the stored state for *session_id*, if any, and return a confirmation."""
    # pop() with a default neither raises nor creates a record for unknown ids
    sessions.pop(session_id, None)
    return "๐ ุชู ุฅุนุงุฏุฉ ุชุนููู ุงูุฌูุณุฉ ุจูุฌุงุญ!\n\nโ Session reset - all letters and words cleared!"
def complete_word(session_id="default"):
    """Handle the send button: submit the built word, or store the answer.

    First press (not waiting for an answer): the accumulated letters are
    joined into a word, translated, sent to the medical agent, and the
    translated reply is stored as the pending question.
    Second press (waiting for an answer): the accumulated letters are saved
    as the answer and the letter buffer is cleared.

    Returns:
        A status string for the word-status textbox.
    """
    if session_id not in sessions:
        return "โ ๏ธ ูุง ุชูุฌุฏ ุฌูุณุฉ - ุงุจุฏุฃ ุจุฅุธูุงุฑ ุฅุดุงุฑุงุช"
    state = sessions[session_id]

    # --- Second press: store the answer and return to normal mode ---
    if state['waiting_for_answer']:
        if state['letters']:
            answer_word_ar = ''.join(state['letters'])
            state['words'].append(answer_word_ar)
            outcome = f"โ ุชู ุญูุธ ุงูุฅุฌุงุจุฉ: {answer_word_ar}"
        else:
            outcome = "โ ๏ธ ูู ูุชู ุฅุฏุฎุงู ุฅุฌุงุจุฉ"
        state.update(
            letters=[],
            last_letter=None,
            letter_stable_count=0,
            waiting_for_answer=False,
            current_question='',
        )
        return f"{outcome}\n\n๐ ุงููุดู ู ุณุชุฃูู - ุฌุงูุฒ ููููู ุฉ ุงูุชุงููุฉ!"

    # --- First press: nothing to send without letters ---
    if not state['letters']:
        return "โ ๏ธ ูุง ุชูุฌุฏ ุฃุญุฑู - ุฃุธูุฑ ุฅุดุงุฑุงุช ุฃููุงู"

    word_ar = ''.join(state['letters'])
    state['words'].append(word_ar)

    # Arabic -> English for the agent
    translator_instance = get_translator()
    word_en = translator_instance.ar_to_en(word_ar)
    logger.info(f"๐ค Sending to HuatuoGPT: {word_ar} โ {word_en}")

    agent_response = get_medical_agent().process_input(
        word_en,
        session_id=session_id,
    )

    # Agent reply -> Arabic for the patient
    reply_ar = translator_instance.en_to_ar(agent_response['response'])

    state['question_count'] = agent_response['question_count']
    state['history'].extend([
        f"ุงูู ุฑูุถ: {word_ar} ({word_en})",
        f"ุงูุทุจูุจ: {reply_ar}",
    ])

    question_text = f"๐จโโ๏ธ ุงูุทุจูุจ ({agent_response['question_count']}/3):\n{reply_ar}\n\nโธ๏ธ ุงููุดู ู ุชููู - ุฃุธูุฑ ุฅุฌุงุจุชู ุซู ุงุถุบุท ุงูุฒุฑ ู ุฑุฉ ุฃุฎุฑู"

    # The letters are deliberately left in place so the word stays visible;
    # only the stabilisation tracking is reset.
    # NOTE(review): because the buffer is not cleared here, letters signed for
    # the answer appear to be appended after the sent word - confirm intended.
    state.update(
        waiting_for_answer=True,
        current_question=question_text,
        last_letter=None,
        letter_stable_count=0,
    )

    return f"โ ุชู ุฅุฑุณุงู: {word_ar} โ {word_en}\n\n๐ค HuatuoGPT ูุญูู...\n\n{question_text}"
def create_interface():
    """Build the Gradio Blocks app: three tabs plus their event wiring.

    Tabs: streaming sign detection, doctor's voice input, and system info
    with a session reset button.

    Returns:
        The gr.Blocks instance; it is not launched here.
    """
    # NOTE(review): the nesting of the `with` blocks below is reconstructed;
    # confirm the layout against the running app.
    with gr.Blocks(title="Arabic Sign Language Medical Interpreter") as app:
        gr.Markdown(
            """
# ๐ฅ Arabic Sign Language Medical Interpreter
This system helps deaf patients communicate with doctors using Arabic sign language.
## ๐ฏ How to use:
1. **Patient**: Show Arabic sign language to the camera
2. **System**: Detects signs, translates, and provides medical questions
3. **Doctor**: Can also speak questions which will be converted for the patient
"""
        )
        # Session key handed to every handler below.
        # NOTE(review): the value is one fixed literal and `sessions` is a
        # module-level mapping, so presumably all browser sessions share the
        # same record - confirm whether a per-visitor id is wanted.
        session_id = gr.State(value="default_session")
        with gr.Tab("๐น Sign Language Detection"):
            gr.Markdown("""
### ๐ฅ Real-Time Sign Detection / ุงููุดู ูู ุงูููุช ุงููุนูู
**๐ Workflow / ุณูุฑ ุงูุนู ู:**
1. **YOLO detects** - ุฃุธูุฑ ุฅุดุงุฑุงุช ูุจูุงุก ููู ุฉ (ุซุงููุฉ ููู ุญุฑู)
2. **Press button** - ุงุถุบุท ุงูุฒุฑ ูุฅุฑุณุงู ุงูููู ุฉ
3. **YOLO pauses** - ูุชููู ุงููุดู ุชููุงุฆูุงู
4. **HuatuoGPT analyzes** - ุงูุฐูุงุก ูุญูู ููุณุฃู
5. **Question shown** - ุงูุณุคุงู ูุธูุฑ (ูุง ูุฎุชูู!)
6. **Show answer signs** - ุฃุธูุฑ ุฅุฌุงุจุชู
7. **Press button again** - ููู ุชุงุจุนุฉ
๐จ **Detection pauses during LLM questions - question stays visible!**
""")
            with gr.Row():
                # Left column: webcam feed and the two action buttons
                with gr.Column():
                    image_input = gr.Image(
                        sources=["webcam"],
                        type="pil",
                        label="๐น Live Camera Feed / ูุงู ูุฑุง ู ุจุงุดุฑุฉ",
                        streaming=True
                    )
                    with gr.Row():
                        complete_word_btn = gr.Button("๐ค ุฅุฑุณุงู ููุฐูุงุก (Send to AI)", variant="primary", size="lg")
                        clear_btn = gr.Button("๐ ู ุณุญ (Clear All)", variant="secondary")
                # Right column: the four stream outputs plus the button status
                with gr.Column():
                    detected_output = gr.Textbox(
                        label="โ ูุชุงุฆุฌ ุงููุดู / Detection Results",
                        lines=3,
                        placeholder="ุณุชุธูุฑ ุงูุญุฑูู ุงูู ูุชุดูุฉ ููุง / Detected letters will appear here..."
                    )
                    arabic_output = gr.Textbox(
                        label="๐ค ุงูููู ุฉ ุงูุนุฑุจูุฉ / Arabic Word",
                        lines=2,
                        placeholder="ุงูููู ุฉ ุงูู ุชุฑุงูู ุฉ / Accumulated word..."
                    )
                    english_output = gr.Textbox(
                        label="๐ ุงูุชุฑุฌู ุฉ / Translation",
                        lines=2,
                        placeholder="ุงูุชุฑุฌู ุฉ ุงูุฅูุฌููุฒูุฉ / English translation..."
                    )
                    response_output = gr.Textbox(
                        label="๐จโโ๏ธ ุงุณุชุฌุงุจุฉ ุงูุทุจูุจ / Medical AI Response",
                        lines=5,
                        placeholder="ุณุชุธูุฑ ุงูุฃุณุฆูุฉ ุงูุทุจูุฉ ููุง / Medical questions will appear here..."
                    )
                    word_status = gr.Textbox(
                        label="๐ ุญุงูุฉ ุงูููู ุฉ / Word Status",
                        lines=2,
                        placeholder="Word completion status..."
                    )
            gr.Markdown("""
### ๐ก Tips for Better Detection / ูุตุงุฆุญ ูููุดู ุงูุฃูุถู:
- **ุฅุถุงุกุฉ ุฌูุฏุฉ / Good Lighting**: Ensure your hands are well-lit
- **ุฎูููุฉ ูุงุถุญุฉ / Clear Background**: Use a plain background
- **ูุถุน ุงููุฏ / Hand Position**: Keep hands centered in view
- **ูุถูุญ ุงูุฅุดุงุฑุฉ / Sign Clarity**: Make distinct, clear signs
- **ุงูู ุณุงูุฉ / Distance**: Comfortable distance from camera
- **ุซุจุงุช / Stability**: Hold each sign steady for ~1 second (wait for ๐ข๐ข)
- **ูุง ุชุนูุฏ ุชุนููู / Don't Reset**: Move hand away between letters - word stays!
""")
            # Auto-streaming detection - no manual button click needed
            image_input.stream(
                fn=process_sign_language_stream,
                inputs=[image_input, session_id],
                outputs=[detected_output, arabic_output, english_output, response_output],
                stream_every=0.5  # Process every 0.5 seconds
            )
            # The send button reports into the word-status box only
            complete_word_btn.click(
                fn=complete_word,
                inputs=[session_id],
                outputs=[word_status]
            )

            def clear_all(session_id):
                """Empty the letter buffer and blank all five text boxes.

                Only 'letters', 'last_letter' and 'letter_stable_count' are
                reset; the other session fields are left as they are.
                """
                if session_id in sessions:
                    sessions[session_id]['letters'] = []
                    sessions[session_id]['last_letter'] = None
                    sessions[session_id]['letter_stable_count'] = 0
                # One empty string per output component wired below
                return "", "", "", "", ""

            clear_btn.click(
                fn=clear_all,
                inputs=[session_id],
                outputs=[detected_output, arabic_output, english_output, response_output, word_status]
            )
        with gr.Tab("๐ค Doctor's Voice Input"):
            with gr.Row():
                with gr.Column():
                    # type="filepath": the handler receives a path, not raw samples
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Doctor's Voice"
                    )
                    audio_btn = gr.Button("๐ค Process Audio", variant="primary", size="lg")
                with gr.Column():
                    doctor_text_output = gr.Textbox(label="๐ค Transcribed Text", lines=3)
                    question_output = gr.Textbox(label="โ Question for Patient (Arabic)", lines=3)
            audio_btn.click(
                fn=process_doctor_audio,
                inputs=[audio_input, session_id],
                outputs=[doctor_text_output, question_output]
            )
        with gr.Tab("โน๏ธ System Info"):
            gr.Markdown(
                """
## ๐ System Features:
- **YOLO-based** Arabic sign language detection
- **Real-time** translation (Arabic โ English)
- **Medical AI** for intelligent questioning
- **ZeroGPU** optimization for efficient processing
## ๐ง Technical Stack:
- YOLOv8 for sign detection
- Helsinki-NLP for translation
- Whisper for speech recognition
- gTTS for text-to-speech
## ๐ก Tips:
- Ensure good lighting for better detection
- Make clear, distinct sign gestures
- Speak clearly into the microphone
"""
            )
            # Full reset: removes the whole session record via reset_session
            reset_btn = gr.Button("๐ Reset Session", variant="secondary")
            reset_output = gr.Textbox(label="Status", lines=1)
            reset_btn.click(
                fn=reset_session,
                inputs=[session_id],
                outputs=[reset_output]
            )
        gr.Markdown(
            """
---
Built with โค๏ธ for accessible healthcare communication
"""
        )
    return app
# Initialize and launch
if __name__ == "__main__":
    logger.info("๐ Starting Arabic Sign Language Medical Interpreter...")

    # Startup order: report the device, load the essential models, build the UI.
    setup_environment()
    initialize_models()
    app = create_interface()

    # Enable the request queue, then serve on all interfaces at port 7860
    # without a public share link.
    app.queue()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app.launch(**launch_options)