Update app.py
Browse files
app.py
CHANGED
|
@@ -637,7 +637,8 @@ def extract_sources_from_search_context(search_context: str, user_language: str
|
|
| 637 |
found_files = set()
|
| 638 |
|
| 639 |
# Pattern to match "From filename.pdf (category):" or "From filename.docx (category):"
|
| 640 |
-
|
|
|
|
| 641 |
matches = re.findall(from_pattern, search_context)
|
| 642 |
|
| 643 |
for match in matches:
|
|
@@ -650,38 +651,47 @@ def extract_sources_from_search_context(search_context: str, user_language: str
|
|
| 650 |
if pdf in search_context:
|
| 651 |
found_files.add(pdf)
|
| 652 |
|
| 653 |
-
# Filter by user language preference
|
| 654 |
-
language_filtered_files =
|
| 655 |
|
|
|
|
| 656 |
for pdf in found_files:
|
| 657 |
# Determine document language from filename
|
| 658 |
doc_language = get_document_language(pdf)
|
| 659 |
|
| 660 |
# Language matching logic
|
|
|
|
| 661 |
if user_language == 'en' or user_language == 'singlish':
|
| 662 |
# English users can see all documents, but prefer English versions
|
| 663 |
if doc_language in ['en', 'english'] or user_language == 'singlish':
|
| 664 |
-
|
| 665 |
elif not any(f for f in found_files if get_document_language(f) in ['en', 'english']):
|
| 666 |
# If no English version available, show other languages
|
| 667 |
-
|
| 668 |
elif user_language == 'si' or user_language == 'sinhala':
|
| 669 |
# Sinhala users prefer Sinhala documents
|
| 670 |
if doc_language in ['si', 'sinhala']:
|
| 671 |
-
|
| 672 |
elif not any(f for f in found_files if get_document_language(f) in ['si', 'sinhala']):
|
| 673 |
# If no Sinhala version available, show other languages
|
| 674 |
-
|
| 675 |
elif user_language == 'ta' or user_language == 'tamil':
|
| 676 |
# Tamil users prefer Tamil documents
|
| 677 |
if doc_language in ['ta', 'tamil']:
|
| 678 |
-
|
| 679 |
elif not any(f for f in found_files if get_document_language(f) in ['ta', 'tamil']):
|
| 680 |
# If no Tamil version available, show other languages
|
| 681 |
-
|
| 682 |
else:
|
| 683 |
# Default: show all documents
|
| 684 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
|
| 686 |
# Convert to list with short names
|
| 687 |
for pdf in language_filtered_files:
|
|
|
|
| 637 |
found_files = set()
|
| 638 |
|
| 639 |
# Pattern to match "From filename.pdf (category):" or "From filename.docx (category):"
|
| 640 |
+
# Requires the assets/pdfs/ prefix, allows empty parentheses, and excludes ':' from the captured filename (the match ends at the closing parenthesis)
|
| 641 |
+
from_pattern = r'From\s+assets/pdfs/([^:]+\.(?:pdf|docx))\s*\([^)]*\)'
|
| 642 |
matches = re.findall(from_pattern, search_context)
|
| 643 |
|
| 644 |
for match in matches:
|
|
|
|
| 651 |
if pdf in search_context:
|
| 652 |
found_files.add(pdf)
|
| 653 |
|
| 654 |
+
# Filter by user language preference and prioritize by relevance
|
| 655 |
+
language_filtered_files = []
|
| 656 |
|
| 657 |
+
# First, collect all language-appropriate documents
|
| 658 |
for pdf in found_files:
|
| 659 |
# Determine document language from filename
|
| 660 |
doc_language = get_document_language(pdf)
|
| 661 |
|
| 662 |
# Language matching logic
|
| 663 |
+
should_include = False
|
| 664 |
if user_language == 'en' or user_language == 'singlish':
|
| 665 |
# English users can see all documents, but prefer English versions
|
| 666 |
if doc_language in ['en', 'english'] or user_language == 'singlish':
|
| 667 |
+
should_include = True
|
| 668 |
elif not any(f for f in found_files if get_document_language(f) in ['en', 'english']):
|
| 669 |
# If no English version available, show other languages
|
| 670 |
+
should_include = True
|
| 671 |
elif user_language == 'si' or user_language == 'sinhala':
|
| 672 |
# Sinhala users prefer Sinhala documents
|
| 673 |
if doc_language in ['si', 'sinhala']:
|
| 674 |
+
should_include = True
|
| 675 |
elif not any(f for f in found_files if get_document_language(f) in ['si', 'sinhala']):
|
| 676 |
# If no Sinhala version available, show other languages
|
| 677 |
+
should_include = True
|
| 678 |
elif user_language == 'ta' or user_language == 'tamil':
|
| 679 |
# Tamil users prefer Tamil documents
|
| 680 |
if doc_language in ['ta', 'tamil']:
|
| 681 |
+
should_include = True
|
| 682 |
elif not any(f for f in found_files if get_document_language(f) in ['ta', 'tamil']):
|
| 683 |
# If no Tamil version available, show other languages
|
| 684 |
+
should_include = True
|
| 685 |
else:
|
| 686 |
# Default: show all documents
|
| 687 |
+
should_include = True
|
| 688 |
+
|
| 689 |
+
if should_include:
|
| 690 |
+
language_filtered_files.append(pdf)
|
| 691 |
+
|
| 692 |
+
# Keep only a single document - NOTE(review): found_files is a set, so the "first" entry is arbitrary, not necessarily the most relevant; confirm the intended ordering
|
| 693 |
+
if language_filtered_files:
|
| 694 |
+
language_filtered_files = [language_filtered_files[0]]
|
| 695 |
|
| 696 |
# Convert to list with short names
|
| 697 |
for pdf in language_filtered_files:
|