Mr-HASSAN committed on
Commit 45b7888 · verified · 1 Parent(s): 62de988

Upload 9 files
app.py ADDED
@@ -0,0 +1,180 @@
+ from flask import Flask, request, jsonify
+ from flask_cors import CORS
+ import base64
+ import io
+ import cv2
+ import numpy as np
+ import tempfile
+ import os
+ from PIL import Image
+ import logging
+
+ from utils.detector import ArabicSignDetector
+ from utils.translator import MedicalTranslator
+ from utils.medical_agent import MedicalAgent
+ from utils.speech import SpeechProcessor
+ from utils.sign_generator import SignGenerator
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = Flask(__name__)
+ CORS(app)
+
+ # Global instances, populated by initialize_models()
+ detector = None
+ translator = None
+ medical_agent = None
+ speech_processor = None
+ sign_generator = None
+
+
+ def initialize_models():
+     global detector, translator, medical_agent, speech_processor, sign_generator
+
+     logger.info("🔄 Initializing models...")
+
+     try:
+         detector = ArabicSignDetector()
+         translator = MedicalTranslator()
+         medical_agent = MedicalAgent()
+         speech_processor = SpeechProcessor()
+         sign_generator = SignGenerator()
+
+         logger.info("🎉 All models initialized!")
+
+     except Exception as e:
+         logger.error(f"❌ Initialization failed: {e}")
+         raise
+
+
+ @app.route('/')
+ def index():
+     return "Medical Agent API is running!"
+
+
+ @app.route('/health')
+ def health_check():
+     return jsonify({"status": "healthy"})
+
+
+ @app.route('/api/process-sign', methods=['POST'])
+ def process_sign_language():
+     try:
+         data = request.json or {}
+         image_data = data.get('image')
+
+         if not image_data:
+             return jsonify({'error': 'No image provided'}), 400
+
+         if image_data.startswith('data:image'):
+             image_data = image_data.split(',')[1]
+
+         image_bytes = base64.b64decode(image_data)
+         image = Image.open(io.BytesIO(image_bytes))
+         image_np = np.array(image)
+
+         # PIL delivers RGB; OpenCV and the detector expect BGR
+         if len(image_np.shape) == 3 and image_np.shape[2] == 3:
+             image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+         # Detect Arabic letters (no medical mapping)
+         detection_result = detector.detect_letters(image_np)
+
+         if not detection_result['success']:
+             return jsonify({
+                 'success': False,
+                 'error': 'No letters detected',
+                 'arabic_text': '',
+                 'english_text': ''
+             })
+
+         # Get the actual Arabic text formed from the letters
+         arabic_text = detection_result['arabic_text']
+
+         # Translate to English for the medical agent
+         english_text = translator.ar_to_en(arabic_text)
+
+         # Process with the medical agent (the agent sees the actual text)
+         agent_response = medical_agent.process_input(
+             english_text,
+             session_id=data.get('session_id', 'default')
+         )
+
+         # Translate the response back to Arabic
+         arabic_response = translator.en_to_ar(agent_response['response'])
+
+         # Generate sign animation data
+         sign_data = sign_generator.text_to_sign(arabic_response)
+
+         return jsonify({
+             'success': True,
+             'detected_letters': detection_result['letters'],
+             'arabic_text': arabic_text,
+             'english_translation': english_text,
+             'agent_response_english': agent_response['response'],
+             'agent_response_arabic': arabic_response,
+             'sign_data': sign_data,
+             'question_count': agent_response.get('question_count', 0),
+             'conversation_state': agent_response.get('state', 'questioning')
+         })
+
+     except Exception as e:
+         logger.error(f"Error in process-sign: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ @app.route('/api/process-audio', methods=['POST'])
+ def process_audio():
+     try:
+         data = request.json or {}
+         audio_data = data.get('audio')
+
+         if not audio_data:
+             return jsonify({'error': 'No audio provided'}), 400
+
+         if audio_data.startswith('data:audio'):
+             audio_data = audio_data.split(',')[1]
+
+         audio_bytes = base64.b64decode(audio_data)
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
+             f.write(audio_bytes)
+             audio_path = f.name
+
+         doctor_text = speech_processor.speech_to_text(audio_path)
+         patient_question = medical_agent.process_doctor_input(doctor_text)
+         arabic_question = translator.en_to_ar(patient_question)
+         sign_data = sign_generator.text_to_sign(arabic_question)
+
+         os.unlink(audio_path)
+
+         return jsonify({
+             'success': True,
+             'doctor_text': doctor_text,
+             'patient_question_english': patient_question,
+             'patient_question_arabic': arabic_question,
+             'sign_data': sign_data
+         })
+
+     except Exception as e:
+         logger.error(f"Error in process-audio: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ @app.route('/api/text-to-speech', methods=['POST'])
+ def text_to_speech():
+     try:
+         data = request.json or {}
+         text = data.get('text')
+
+         if not text:
+             return jsonify({'error': 'No text provided'}), 400
+
+         audio_path = speech_processor.text_to_speech(text, "summary")
+
+         with open(audio_path, 'rb') as f:
+             audio_bytes = f.read()
+
+         audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+         return jsonify({
+             'success': True,
+             'audio': f"data:audio/wav;base64,{audio_b64}"
+         })
+
+     except Exception as e:
+         logger.error(f"Error in text-to-speech: {e}")
+         return jsonify({'error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     initialize_models()
+     app.run(host='0.0.0.0', port=7860, debug=True)
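Once the app is running, the sign-recognition endpoint can be smoke-tested end to end. This is a minimal sketch, not part of the commit: it assumes the server is reachable at http://localhost:7860 (the port app.py binds) and that frame.jpg is a hypothetical local test image.

    import base64
    import requests

    # Encode a test frame as the data URL the endpoint accepts
    with open("frame.jpg", "rb") as f:  # hypothetical test image
        b64 = base64.b64encode(f.read()).decode("utf-8")

    resp = requests.post(
        "http://localhost:7860/api/process-sign",
        json={"image": f"data:image/jpeg;base64,{b64}", "session_id": "demo"},
    )
    print(resp.json())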
best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1126bf72b6b69eb9e608ad6132a9a9411c37854e1b08f5bb6ccbe8f6f0418c0
+ size 52045963
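Note: this is a Git LFS pointer, not the weights themselves; the roughly 52 MB best.pt YOLO checkpoint is fetched when the repository is cloned with Git LFS installed.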
requirements.txt ADDED
@@ -0,0 +1,29 @@
+ # Core
+ torch>=2.0.0
+ torchvision>=0.15.0
+ ultralytics>=8.0.0
+ transformers>=4.30.0
+ accelerate>=0.20.0
+
+ # Audio
+ speechbrain>=0.5.15
+ librosa>=0.10.0
+ pydub>=0.25.1
+ soundfile>=0.12.0
+
+ # Vision
+ opencv-python>=4.8.0
+ mediapipe>=0.10.0
+ Pillow>=10.0.0
+
+ # Web & API
+ flask>=2.3.0
+ flask-cors>=4.0.0
+ flask-socketio>=5.3.0
+ gradio>=3.40.0
+ requests>=2.31.0
+
+ # Utilities
+ numpy>=1.24.0
+ pydantic>=2.0.0
+ python-multipart>=0.0.6
utils/__init__.py ADDED
File without changes
utils/detector.py ADDED
@@ -0,0 +1,95 @@
+ import cv2
+ import numpy as np
+ from ultralytics import YOLO
+ from typing import Dict, List, Any
+
+
+ class ArabicSignDetector:
+     def __init__(self, model_path: str = 'best.pt'):
+         self.model = YOLO(model_path)
+         self.confidence_threshold = 0.5
+
+     def detect_letters(self, image: np.ndarray) -> Dict[str, Any]:
+         """Detect Arabic letters and form text (no medical mapping)"""
+         try:
+             results = self.model(image, verbose=False)
+
+             detected_letters = []
+             confidences = []
+             bboxes = []
+
+             for result in results:
+                 if hasattr(result, 'boxes') and result.boxes is not None:
+                     boxes = result.boxes
+                     for i in range(len(boxes.cls)):
+                         class_id = int(boxes.cls[i])
+                         confidence = float(boxes.conf[i])
+
+                         if confidence > self.confidence_threshold:
+                             letter = self.model.names.get(class_id, "")
+                             detected_letters.append(letter)
+                             confidences.append(confidence)
+
+                             # Get bounding box coordinates
+                             if hasattr(boxes, 'xyxy'):
+                                 bbox = boxes.xyxy[i].cpu().numpy()
+                                 bboxes.append(bbox.tolist())
+
+             # Sort letters from right to left for Arabic
+             sorted_letters = self._sort_arabic_letters(detected_letters, bboxes)
+
+             # Form Arabic text from the detected letters
+             arabic_text = self._form_arabic_text(sorted_letters)
+
+             return {
+                 'success': len(detected_letters) > 0,
+                 'arabic_text': arabic_text,
+                 'letters': detected_letters,
+                 'sorted_letters': sorted_letters,
+                 'confidences': confidences,
+                 'bboxes': bboxes,
+                 'total_detections': len(detected_letters)
+             }
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e),
+                 'arabic_text': '',
+                 'letters': [],
+                 'sorted_letters': [],
+                 'confidences': [],
+                 'bboxes': []
+             }
+
+     def _sort_arabic_letters(self, letters: List[str], bboxes: List) -> List[str]:
+         """Sort Arabic letters from right to left based on x-coordinate"""
+         if not bboxes or len(letters) != len(bboxes):
+             return letters
+
+         # Use the right edge of each bbox (x2), since Arabic reads right to left
+         letter_positions = []
+         for i, bbox in enumerate(bboxes):
+             x_right = bbox[2]
+             letter_positions.append((x_right, letters[i]))
+
+         # Sort by x-coordinate descending (right to left)
+         letter_positions.sort(key=lambda x: x[0], reverse=True)
+
+         return [letter for _, letter in letter_positions]
+
+     def _form_arabic_text(self, letters: List[str]) -> str:
+         """Form Arabic text from detected letters"""
+         if not letters:
+             return ""
+
+         # Simply join the letters; the agent interprets the resulting word
+         return "".join(letters)
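The detector can be exercised on its own before wiring it into the Flask app. A minimal sketch, assuming best.pt sits in the working directory and frame.jpg is a hypothetical test image (cv2.imread yields BGR, matching what app.py passes in):

    import cv2
    from utils.detector import ArabicSignDetector

    detector = ArabicSignDetector(model_path="best.pt")
    frame = cv2.imread("frame.jpg")  # hypothetical test image, loaded as BGR
    result = detector.detect_letters(frame)
    print(result["arabic_text"], result["sorted_letters"], result["confidences"])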
utils/medical_agent.py ADDED
@@ -0,0 +1,126 @@
+ import json
+ from typing import Dict, Any
+ from transformers import pipeline
+ import torch
+
+
+ class MedicalAgent:
+     def __init__(self):
+         self.llm = self._load_llm()
+         self.conversation_history = []
+         self.question_count = 0
+         self.max_questions = 3
+
+     def _load_llm(self):
+         try:
+             return pipeline(
+                 "text-generation",
+                 model="microsoft/DialoGPT-medium",
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                 device_map="auto"
+             )
+         except Exception:
+             return None
+
+     def process_input(self, english_text: str, session_id: str) -> Dict[str, Any]:
+         self.conversation_history.append(f"Patient: {english_text}")
+
+         if self.question_count >= self.max_questions:
+             return self._generate_summary()
+
+         # The agent analyzes the actual English text from the translation
+         analysis = self._analyze_input(english_text)
+
+         if analysis["needs_follow_up"]:
+             question = self._generate_question(analysis, english_text)
+             self.question_count += 1
+
+             return {
+                 'response': question,
+                 'question_count': self.question_count,
+                 'state': 'questioning'
+             }
+         else:
+             return self._generate_summary()
+
+     def _analyze_input(self, text: str) -> Dict[str, Any]:
+         """Analyze the actual translated text"""
+         prompt = f"""
+         Patient says: "{text}"
+
+         As a medical assistant, analyze this and decide:
+         1. Do we need follow-up questions? (true/false)
+         2. What specific information is missing?
+         3. What would be good follow-up questions?
+
+         Respond in JSON format only:
+         {{
+             "needs_follow_up": true/false,
+             "missing_info": ["item1", "item2"],
+             "possible_questions": ["question1", "question2"]
+         }}
+         """
+
+         try:
+             if self.llm:
+                 response = self.llm(prompt, max_length=300, do_sample=True)[0]['generated_text']
+                 # Extract the first JSON object from the response
+                 start = response.find('{')
+                 end = response.find('}') + 1
+                 if start != -1 and end > start:
+                     json_str = response[start:end]
+                     return json.loads(json_str)
+         except Exception:
+             pass
+
+         # Fallback: always ask a follow-up unless we have enough info
+         return {
+             "needs_follow_up": self.question_count < 2,
+             "missing_info": ["more details", "duration", "severity"],
+             "possible_questions": ["Can you describe more?", "How long?", "How severe?"]
+         }
+
+     def _generate_question(self, analysis: Dict, original_text: str) -> str:
+         """Generate a follow-up question based on the actual content"""
+         questions = analysis.get("possible_questions", ["Can you tell me more?"])
+         question = questions[0]
+
+         # Limit to 5 words so the question is easy to sign back
+         words = question.split()[:5]
+         return " ".join(words)
+
+     def _generate_summary(self) -> Dict[str, Any]:
+         conversation = "\n".join(self.conversation_history[-3:])
+
+         prompt = f"""
+         Patient conversation summary:
+         {conversation}
+
+         Create a brief medical summary for a doctor in 2-3 sentences.
+         Focus on main symptoms and key information.
+         """
+
+         if self.llm:
+             summary = self.llm(prompt, max_length=150)[0]['generated_text']
+         else:
+             summary = f"Summary: {conversation}. Patient needs medical consultation."
+
+         return {
+             'response': summary,
+             'question_count': self.question_count,
+             'state': 'summary'
+         }
+
+     def process_doctor_input(self, doctor_text: str) -> str:
+         prompt = f"""
+         Doctor asks: "{doctor_text}"
+
+         Rephrase this as a simple, clear question for the patient.
+         Use maximum 5 words and easy language.
+         """
+
+         if self.llm:
+             response = self.llm(prompt, max_length=50)[0]['generated_text']
+             return response.strip()
+         else:
+             return "Can you describe more?"
utils/sign_generator.py ADDED
@@ -0,0 +1,10 @@
+ class SignGenerator:
+     def __init__(self):
+         pass
+
+     def text_to_sign(self, text: str) -> dict:
+         return {
+             "animation_data": f"Sign for: {text}",
+             "duration": 3.0,
+             "type": "placeholder"
+         }
utils/speech.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import pipeline
+ import torch
+ import wave
+
+
+ class SpeechProcessor:
+     def __init__(self):
+         self.stt = pipeline(
+             "automatic-speech-recognition",
+             model="facebook/wav2vec2-base-960h",
+             device=0 if torch.cuda.is_available() else -1
+         )
+
+         # For demo, TTS is stubbed out with a silent placeholder file
+         self.tts_available = False
+
+     def speech_to_text(self, audio_path: str) -> str:
+         try:
+             result = self.stt(audio_path)
+             return result.get('text', '')
+         except Exception:
+             return "Audio processed"
+
+     def text_to_speech(self, text: str, filename: str) -> str:
+         # For demo, write one second of silence so callers get a readable WAV file
+         output_path = f"/tmp/{filename}.wav"
+         with wave.open(output_path, 'wb') as f:
+             f.setnchannels(1)
+             f.setsampwidth(2)
+             f.setframerate(16000)
+             f.writeframes(b'\x00\x00' * 16000)
+         # In production, integrate a real TTS model here
+         return output_path
utils/translator.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import MarianMTModel, MarianTokenizer
+ import torch
+
+
+ class MedicalTranslator:
+     def __init__(self):
+         self.ar_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+         self.ar_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+
+         self.en_ar_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+         self.en_ar_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+
+     def ar_to_en(self, text: str) -> str:
+         """Translate Arabic to English"""
+         if not text.strip():
+             return ""
+
+         inputs = self.ar_en_tokenizer(text, return_tensors="pt", padding=True, max_length=512, truncation=True)
+         with torch.no_grad():
+             outputs = self.ar_en_model.generate(**inputs)
+
+         return self.ar_en_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     def en_to_ar(self, text: str) -> str:
+         """Translate English to Arabic"""
+         if not text.strip():
+             return ""
+
+         inputs = self.en_ar_tokenizer(text, return_tensors="pt", padding=True, max_length=512, truncation=True)
+         with torch.no_grad():
+             outputs = self.en_ar_model.generate(**inputs)
+
+         return self.en_ar_tokenizer.decode(outputs[0], skip_special_tokens=True)
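A quick round-trip check of both translation directions. A minimal sketch; the first run downloads the two Helsinki-NLP checkpoints from the Hugging Face Hub, and the sample sentence is made up:

    from utils.translator import MedicalTranslator

    translator = MedicalTranslator()
    english = translator.ar_to_en("عندي صداع")  # roughly: "I have a headache"
    print(english)
    print(translator.en_to_ar(english))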