Mr-HASSAN committed on
Commit 45b7888 · verified · 1 Parent(s): 62de988

Upload 9 files
app.py ADDED
@@ -0,0 +1,180 @@
+ from flask import Flask, request, jsonify
+ from flask_cors import CORS
+ import base64
+ import io
+ import cv2
+ import numpy as np
+ import tempfile
+ import os
+ from PIL import Image
+ import logging
+
+ from utils.detector import ArabicSignDetector
+ from utils.translator import MedicalTranslator
+ from utils.medical_agent import MedicalAgent
+ from utils.speech import SpeechProcessor
+ from utils.sign_generator import SignGenerator
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = Flask(__name__)
+ CORS(app)
+
+ # Global instances, populated by initialize_models()
+ detector = None
+ translator = None
+ medical_agent = None
+ speech_processor = None
+ sign_generator = None
+
+
+ def initialize_models():
+     global detector, translator, medical_agent, speech_processor, sign_generator
+
+     logger.info("🔄 Initializing models...")
+
+     try:
+         detector = ArabicSignDetector()
+         translator = MedicalTranslator()
+         medical_agent = MedicalAgent()
+         speech_processor = SpeechProcessor()
+         sign_generator = SignGenerator()
+
+         logger.info("🎉 All models initialized!")
+
+     except Exception as e:
+         logger.error(f"❌ Initialization failed: {e}")
+         raise
+
+
+ @app.route('/')
+ def index():
+     return "Medical Agent API is running!"
+
+
+ @app.route('/health')
+ def health_check():
+     return jsonify({"status": "healthy"})
+
+
+ @app.route('/api/process-sign', methods=['POST'])
+ def process_sign_language():
+     try:
+         data = request.json or {}
+         image_data = data.get('image')
+
+         if not image_data:
+             return jsonify({'error': 'No image provided'}), 400
+
+         if image_data.startswith('data:image'):
+             image_data = image_data.split(',')[1]
+
+         image_bytes = base64.b64decode(image_data)
+         image = Image.open(io.BytesIO(image_bytes))
+         image_np = np.array(image)
+
+         # PIL delivers RGB; OpenCV and the detector expect BGR
+         if len(image_np.shape) == 3 and image_np.shape[2] == 3:
+             image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+         # Detect Arabic letters (no medical mapping)
+         detection_result = detector.detect_letters(image_np)
+
+         if not detection_result['success']:
+             return jsonify({
+                 'success': False,
+                 'error': 'No letters detected',
+                 'arabic_text': '',
+                 'english_text': ''
+             })
+
+         # Get the actual Arabic text formed from the letters
+         arabic_text = detection_result['arabic_text']
+
+         # Translate to English for the medical agent
+         english_text = translator.ar_to_en(arabic_text)
+
+         # Process with the medical agent (the agent sees the actual text)
+         agent_response = medical_agent.process_input(
+             english_text,
+             session_id=data.get('session_id', 'default')
+         )
+
+         # Translate the response back to Arabic
+         arabic_response = translator.en_to_ar(agent_response['response'])
+
+         # Generate sign animation data
+         sign_data = sign_generator.text_to_sign(arabic_response)
+
+         return jsonify({
+             'success': True,
+             'detected_letters': detection_result['letters'],
+             'arabic_text': arabic_text,
+             'english_translation': english_text,
+             'agent_response_english': agent_response['response'],
+             'agent_response_arabic': arabic_response,
+             'sign_data': sign_data,
+             'question_count': agent_response.get('question_count', 0),
+             'conversation_state': agent_response.get('state', 'questioning')
+         })
+
+     except Exception as e:
+         logger.error(f"Error in process-sign: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ @app.route('/api/process-audio', methods=['POST'])
+ def process_audio():
+     try:
+         data = request.json or {}
+         audio_data = data.get('audio')
+
+         if not audio_data:
+             return jsonify({'error': 'No audio provided'}), 400
+
+         if audio_data.startswith('data:audio'):
+             audio_data = audio_data.split(',')[1]
+
+         audio_bytes = base64.b64decode(audio_data)
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
+             f.write(audio_bytes)
+             audio_path = f.name
+
+         doctor_text = speech_processor.speech_to_text(audio_path)
+         patient_question = medical_agent.process_doctor_input(doctor_text)
+         arabic_question = translator.en_to_ar(patient_question)
+         sign_data = sign_generator.text_to_sign(arabic_question)
+
+         os.unlink(audio_path)
+
+         return jsonify({
+             'success': True,
+             'doctor_text': doctor_text,
+             'patient_question_english': patient_question,
+             'patient_question_arabic': arabic_question,
+             'sign_data': sign_data
+         })
+
+     except Exception as e:
+         logger.error(f"Error in process-audio: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ @app.route('/api/text-to-speech', methods=['POST'])
+ def text_to_speech():
+     try:
+         data = request.json or {}
+         text = data.get('text')
+
+         if not text:
+             return jsonify({'error': 'No text provided'}), 400
+
+         audio_path = speech_processor.text_to_speech(text, "summary")
+
+         with open(audio_path, 'rb') as f:
+             audio_bytes = f.read()
+
+         audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+         return jsonify({
+             'success': True,
+             'audio': f"data:audio/wav;base64,{audio_b64}"
+         })
+
+     except Exception as e:
+         logger.error(f"Error in text-to-speech: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     initialize_models()
+     app.run(host='0.0.0.0', port=7860, debug=True)
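Once the app is running, the sign-recognition endpoint can be smoke-tested end to end. This is a minimal sketch, not part of the commit: it assumes the server is reachable at http://localhost:7860 (the port app.py binds) and that frame.jpg is a hypothetical local test image.

    import base64
    import requests

    # Encode a test frame as the data URL the endpoint accepts
    with open("frame.jpg", "rb") as f:  # hypothetical test image
        b64 = base64.b64encode(f.read()).decode("utf-8")

    resp = requests.post(
        "http://localhost:7860/api/process-sign",
        json={"image": f"data:image/jpeg;base64,{b64}", "session_id": "demo"},
    )
    print(resp.json())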
best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1126bf72b6b69eb9e608ad6132a9a9411c37854e1b08f5bb6ccbe8f6f0418c0
+ size 52045963
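Note: this is a Git LFS pointer, not the weights themselves; the roughly 52 MB best.pt YOLO checkpoint is fetched when the repository is cloned with Git LFS installed.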
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ # Core
+ torch>=2.0.0
+ torchvision>=0.15.0
+ ultralytics>=8.0.0
+ transformers>=4.30.0
+ accelerate>=0.20.0
+
+ # Audio
+ speechbrain>=0.5.15
+ librosa>=0.10.0
+ pydub>=0.25.1
+ soundfile>=0.12.0
+
+ # Vision
+ opencv-python>=4.8.0
+ mediapipe>=0.10.0
+ Pillow>=10.0.0
+
+ # Web & API
+ flask>=2.3.0
+ flask-cors>=4.0.0
+ flask-socketio>=5.3.0
+ gradio>=3.40.0
+ requests>=2.31.0
+
+ # Utilities
+ numpy>=1.24.0
+ pydantic>=2.0.0
+ python-multipart>=0.0.6
utils/__init__.py ADDED
File without changes
utils/detector.py ADDED
@@ -0,0 +1,95 @@
+ import cv2
+ import numpy as np
+ from ultralytics import YOLO
+ from typing import Dict, List, Any
+
+
+ class ArabicSignDetector:
+     def __init__(self, model_path: str = 'best.pt'):
+         self.model = YOLO(model_path)
+         self.confidence_threshold = 0.5
+
+     def detect_letters(self, image: np.ndarray) -> Dict[str, Any]:
+         """Detect Arabic letters and form text (no medical mapping)"""
+         try:
+             results = self.model(image, verbose=False)
+
+             detected_letters = []
+             confidences = []
+             bboxes = []
+
+             for result in results:
+                 if hasattr(result, 'boxes') and result.boxes is not None:
+                     boxes = result.boxes
+                     for i in range(len(boxes.cls)):
+                         class_id = int(boxes.cls[i])
+                         confidence = float(boxes.conf[i])
+
+                         if confidence > self.confidence_threshold:
+                             letter = self.model.names.get(class_id, "")
+                             detected_letters.append(letter)
+                             confidences.append(confidence)
+
+                             # Get bounding box coordinates
+                             if hasattr(boxes, 'xyxy'):
+                                 bbox = boxes.xyxy[i].cpu().numpy()
+                                 bboxes.append(bbox.tolist())
+
+             # Sort letters from right to left for Arabic
+             sorted_letters = self._sort_arabic_letters(detected_letters, bboxes)
+
+             # Form Arabic text from the detected letters
+             arabic_text = self._form_arabic_text(sorted_letters)
+
+             return {
+                 'success': len(detected_letters) > 0,
+                 'arabic_text': arabic_text,
+                 'letters': detected_letters,
+                 'sorted_letters': sorted_letters,
+                 'confidences': confidences,
+                 'bboxes': bboxes,
+                 'total_detections': len(detected_letters)
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e),
+                 'arabic_text': '',
+                 'letters': [],
+                 'sorted_letters': [],
+                 'confidences': [],
+                 'bboxes': []
+             }
+
+     def _sort_arabic_letters(self, letters: List[str], bboxes: List) -> List[str]:
+         """Sort Arabic letters from right to left based on x-coordinate"""
+         if not bboxes or len(letters) != len(bboxes):
+             return letters
+
+         # Use the right edge of each bbox (x2), since Arabic reads right to left
+         letter_positions = []
+         for i, bbox in enumerate(bboxes):
+             x_right = bbox[2]
+             letter_positions.append((x_right, letters[i]))
+
+         # Sort by x-coordinate descending (right to left)
+         letter_positions.sort(key=lambda x: x[0], reverse=True)
+
+         return [letter for _, letter in letter_positions]
+
+     def _form_arabic_text(self, letters: List[str]) -> str:
+         """Form Arabic text from detected letters"""
+         if not letters:
+             return ""
+
+         # Simply join the letters; the agent interprets the resulting word
+         return "".join(letters)
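The detector can be exercised on its own before wiring it into the Flask app. A minimal sketch, assuming best.pt sits in the working directory and frame.jpg is a hypothetical test image (cv2.imread yields BGR, matching what app.py passes in):

    import cv2
    from utils.detector import ArabicSignDetector

    detector = ArabicSignDetector(model_path="best.pt")
    frame = cv2.imread("frame.jpg")  # hypothetical test image, loaded as BGR
    result = detector.detect_letters(frame)
    print(result["arabic_text"], result["sorted_letters"], result["confidences"])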
utils/medical_agent.py ADDED
@@ -0,0 +1,126 @@
+ import json
+ from typing import Dict, Any
+ from transformers import pipeline
+ import torch
+
+
+ class MedicalAgent:
+     def __init__(self):
+         self.llm = self._load_llm()
+         self.conversation_history = []
+         self.question_count = 0
+         self.max_questions = 3
+
+     def _load_llm(self):
+         try:
+             return pipeline(
+                 "text-generation",
+                 model="microsoft/DialoGPT-medium",
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                 device_map="auto"
+             )
+         except Exception:
+             return None
+
+     def process_input(self, english_text: str, session_id: str) -> Dict[str, Any]:
+         self.conversation_history.append(f"Patient: {english_text}")
+
+         if self.question_count >= self.max_questions:
+             return self._generate_summary()
+
+         # The agent analyzes the actual English text from the translation
+         analysis = self._analyze_input(english_text)
+
+         if analysis["needs_follow_up"]:
+             question = self._generate_question(analysis, english_text)
+             self.question_count += 1
+
+             return {
+                 'response': question,
+                 'question_count': self.question_count,
+                 'state': 'questioning'
+             }
+         else:
+             return self._generate_summary()
+
+     def _analyze_input(self, text: str) -> Dict[str, Any]:
+         """Analyze the actual translated text"""
+         prompt = f"""
+         Patient says: "{text}"
+
+         As a medical assistant, analyze this and decide:
+         1. Do we need follow-up questions? (true/false)
+         2. What specific information is missing?
+         3. What would be good follow-up questions?
+
+         Respond in JSON format only:
+         {{
+             "needs_follow_up": true/false,
+             "missing_info": ["item1", "item2"],
+             "possible_questions": ["question1", "question2"]
+         }}
+         """
+
+         try:
+             if self.llm:
+                 response = self.llm(prompt, max_length=300, do_sample=True)[0]['generated_text']
+                 # Extract the first JSON object from the response
+                 start = response.find('{')
+                 end = response.find('}') + 1
+                 if start != -1 and end > start:
+                     json_str = response[start:end]
+                     return json.loads(json_str)
+         except Exception:
+             pass
+
+         # Fallback: always ask a follow-up unless we have enough info
+         return {
+             "needs_follow_up": self.question_count < 2,
+             "missing_info": ["more details", "duration", "severity"],
+             "possible_questions": ["Can you describe more?", "How long?", "How severe?"]
+         }
+
+     def _generate_question(self, analysis: Dict, original_text: str) -> str:
+         """Generate a follow-up question based on the actual content"""
+         questions = analysis.get("possible_questions", ["Can you tell me more?"])
+         question = questions[0]
+
+         # Limit to 5 words so the question is easy to sign back
+         words = question.split()[:5]
+         return " ".join(words)
+
+     def _generate_summary(self) -> Dict[str, Any]:
+         conversation = "\n".join(self.conversation_history[-3:])
+
+         prompt = f"""
+         Patient conversation summary:
+         {conversation}
+
+         Create a brief medical summary for a doctor in 2-3 sentences.
+         Focus on main symptoms and key information.
+         """
+
+         if self.llm:
+             summary = self.llm(prompt, max_length=150)[0]['generated_text']
+         else:
+             summary = f"Summary: {conversation}. Patient needs medical consultation."
+
+         return {
+             'response': summary,
+             'question_count': self.question_count,
+             'state': 'summary'
+         }
+
+     def process_doctor_input(self, doctor_text: str) -> str:
+         prompt = f"""
+         Doctor asks: "{doctor_text}"
+
+         Rephrase this as a simple, clear question for the patient.
+         Use maximum 5 words and easy language.
+         """
+
+         if self.llm:
+             response = self.llm(prompt, max_length=50)[0]['generated_text']
+             return response.strip()
+         else:
+             return "Can you describe more?"
utils/sign_generator.py ADDED
@@ -0,0 +1,10 @@
+ class SignGenerator:
+     def __init__(self):
+         pass
+
+     def text_to_sign(self, text: str) -> dict:
+         return {
+             "animation_data": f"Sign for: {text}",
+             "duration": 3.0,
+             "type": "placeholder"
+         }
utils/speech.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import pipeline
+ import torch
+ import wave
+
+
+ class SpeechProcessor:
+     def __init__(self):
+         self.stt = pipeline(
+             "automatic-speech-recognition",
+             model="facebook/wav2vec2-base-960h",
+             device=0 if torch.cuda.is_available() else -1
+         )
+
+         # For demo, TTS is stubbed out with a silent placeholder file
+         self.tts_available = False
+
+     def speech_to_text(self, audio_path: str) -> str:
+         try:
+             result = self.stt(audio_path)
+             return result.get('text', '')
+         except Exception:
+             return "Audio processed"
+
+     def text_to_speech(self, text: str, filename: str) -> str:
+         # For demo, write one second of silence so callers get a readable WAV file
+         output_path = f"/tmp/{filename}.wav"
+         with wave.open(output_path, 'wb') as f:
+             f.setnchannels(1)
+             f.setsampwidth(2)
+             f.setframerate(16000)
+             f.writeframes(b'\x00\x00' * 16000)
+         # In production, integrate a real TTS model here
+         return output_path
utils/translator.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import MarianMTModel, MarianTokenizer
+ import torch
+
+
+ class MedicalTranslator:
+     def __init__(self):
+         self.ar_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+         self.ar_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+
+         self.en_ar_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+         self.en_ar_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+
+     def ar_to_en(self, text: str) -> str:
+         """Translate Arabic to English"""
+         if not text.strip():
+             return ""
+
+         inputs = self.ar_en_tokenizer(text, return_tensors="pt", padding=True, max_length=512, truncation=True)
+         with torch.no_grad():
+             outputs = self.ar_en_model.generate(**inputs)
+
+         return self.ar_en_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     def en_to_ar(self, text: str) -> str:
+         """Translate English to Arabic"""
+         if not text.strip():
+             return ""
+
+         inputs = self.en_ar_tokenizer(text, return_tensors="pt", padding=True, max_length=512, truncation=True)
+         with torch.no_grad():
+             outputs = self.en_ar_model.generate(**inputs)
+
+         return self.en_ar_tokenizer.decode(outputs[0], skip_special_tokens=True)
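A quick round-trip check of both translation directions. A minimal sketch; the first run downloads the two Helsinki-NLP checkpoints from the Hugging Face Hub, and the sample sentence is made up:

    from utils.translator import MedicalTranslator

    translator = MedicalTranslator()
    english = translator.ar_to_en("عندي صداع")  # roughly: "I have a headache"
    print(english)
    print(translator.en_to_ar(english))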