danulr05 committed on
Commit
73092be
·
verified ·
1 Parent(s): aa35137

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -7
app.py CHANGED
@@ -387,7 +387,63 @@ def create_agent(session_id: str) -> AgentExecutor:
387
  return agent_executor
388
 
389
  def get_short_document_name(filename: str) -> str:
390
- """Convert long document names to shorter, user-friendly names automatically"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  # Remove file extension
392
  name = filename.replace('.pdf', '').replace('.docx', '')
393
 
@@ -395,8 +451,8 @@ def get_short_document_name(filename: str) -> str:
395
  short_names = {
396
  '20241211_Econ_VRProposals_Budget2025_OnePagers': 'Budget 2025 One-Pagers',
397
  '20250813_Budget2026_Proposal_ExpandingIndustrialLand_En_F': 'Industrial Land Expansion (EN)',
398
- '20250813_Budget2026Proposal_MaternityLeaveBenefit_Raj_D01': 'Maternity Leave Benefits (Raj)',
399
- '20250813_Budget2026Proposal_RemovalOfTaxationOnEPF_Raj_F': 'EPF Tax Removal (Raj)',
400
  '20250825_Budget2026Proposal_MaternityLeaveBenefit_Sin_F': 'Maternity Leave Benefits (Sinhala)',
401
  '20250825_Budget2026Proposal_MaternityLeaveBenefit_Tam_F': 'Maternity Leave Benefits (Tamil)',
402
  '20250825_Budget2026Proposal_RemovalOfTaxationOnEPF_Sin_Final': 'EPF Tax Removal (Sinhala)',
@@ -490,8 +546,8 @@ def get_available_pdfs() -> List[str]:
490
  '20250908_Budget2026Proposal_Template.pdf'
491
  ]
492
 
493
- def extract_sources_from_search_context(search_context: str) -> List[Dict[str, str]]:
494
- """Extract source documents from search context with short names"""
495
  sources = []
496
 
497
  # Get dynamically available PDF files
@@ -515,8 +571,41 @@ def extract_sources_from_search_context(search_context: str) -> List[Dict[str, s
515
  if pdf in search_context:
516
  found_files.add(pdf)
517
 
518
- # Convert to list with short names
 
 
519
  for pdf in found_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  sources.append({
521
  "filename": pdf,
522
  "short_name": get_short_document_name(pdf)
@@ -524,6 +613,22 @@ def extract_sources_from_search_context(search_context: str) -> List[Dict[str, s
524
 
525
  return sources
526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
  def extract_sources_from_response(response: str) -> List[Dict[str, str]]:
528
  """Extract source documents mentioned in the response with short names (fallback method)"""
529
  sources = []
@@ -643,7 +748,7 @@ Please provide a helpful response:"""
643
  # No need to translate response - Gemini handles language matching automatically
644
 
645
  # Extract sources from search context (where the actual filenames are)
646
- sources = extract_sources_from_search_context(search_context)
647
 
648
  # Add messages to memory (store original user message for context)
649
  memory.chat_memory.add_user_message(user_message)
 
387
  return agent_executor
388
 
389
  def get_short_document_name(filename: str) -> str:
390
+ """
391
+ Convert long document names to shorter, user-friendly names automatically
392
+
393
+ SHORT NAME GENERATION GUIDE:
394
+ ===========================
395
+
396
+ 1. MANUAL MAPPING (Priority 1):
397
+ - Add entries to the 'short_names' dictionary for specific files
398
+ - Format: 'full_filename_without_extension': 'Short Display Name'
399
+ - Example: '20250813_Budget2026Proposal_MaternityLeaveBenefit_Raj_D01': 'Maternity Leave Benefits'
400
+
401
+ 2. AUTOMATIC PATTERN MATCHING (Priority 2):
402
+ - System automatically detects proposal types and languages
403
+ - Proposal Types Detected:
404
+ * MaternityLeaveBenefit/MaternityLeave → "Maternity Leave Benefits"
405
+ * RemovalOfTaxationOnEPF/EPF → "EPF Tax Removal"
406
+ * ExpandingIndustrialLand/IndustrialLand → "Industrial Land Expansion"
407
+ * Budget2025/Budget2026 → "Budget 2025/2026 Proposals"
408
+ * Template → "Budget Template"
409
+ * OnePagers → "Budget YYYY One-Pagers"
410
+
411
+ - Language Detection:
412
+ * _Sin_/_Sinhala_ → "(Sinhala)"
413
+ * _Tam_/_Tamil_ → "(Tamil)"
414
+ * _En_/_English_ → "(EN)"
415
+ * _Raj_ → No language suffix (treated as default/English)
416
+ * No language indicator → No language suffix
417
+
418
+ 3. GENERIC FALLBACK (Priority 3):
419
+ - Removes date prefixes: 20250813_ → ""
420
+ - Removes language suffixes: _Sin_, _Tam_, _Raj_, _En_, _F_, _Final_, _D01
421
+ - Removes budget prefixes: Budget2026Proposal_ → ""
422
+ - Converts underscores to spaces: _ → " "
423
+ - Capitalizes words: "maternity leave" → "Maternity Leave"
424
+ - Limits length: Truncates to 37 chars + "..." if longer than 40
425
+
426
+ EXAMPLES:
427
+ =========
428
+ Input: "20250813_Budget2026Proposal_MaternityLeaveBenefit_Sin_F.pdf"
429
+ Output: "Maternity Leave Benefits (Sinhala)"
430
+
431
+ Input: "20250825_Budget2026Proposal_RemovalOfTaxationOnEPF_Tam_F.pdf"
432
+ Output: "EPF Tax Removal (Tamil)"
433
+
434
+ Input: "20250813_Budget2026_Proposal_ExpandingIndustrialLand_En_F.pdf"
435
+ Output: "Industrial Land Expansion (EN)"
436
+
437
+ Input: "20250813_Budget2026Proposal_MaternityLeaveBenefit_Raj_D01.pdf"
438
+ Output: "Maternity Leave Benefits" (no language suffix)
439
+
440
+ HOW TO ADD NEW DOCUMENTS:
441
+ =========================
442
+ 1. Drop the PDF/DOCX file in the assets/pdfs/ folder
443
+ 2. The system will automatically generate a short name using pattern matching
444
+ 3. If you want a custom name, add it to the 'short_names' dictionary
445
+ 4. No code changes needed for automatic naming!
446
+ """
447
  # Remove file extension
448
  name = filename.replace('.pdf', '').replace('.docx', '')
449
 
 
451
  short_names = {
452
  '20241211_Econ_VRProposals_Budget2025_OnePagers': 'Budget 2025 One-Pagers',
453
  '20250813_Budget2026_Proposal_ExpandingIndustrialLand_En_F': 'Industrial Land Expansion (EN)',
454
+ '20250813_Budget2026Proposal_MaternityLeaveBenefit_Raj_D01': 'Maternity Leave Benefits',
455
+ '20250813_Budget2026Proposal_RemovalOfTaxationOnEPF_Raj_F': 'EPF Tax Removal',
456
  '20250825_Budget2026Proposal_MaternityLeaveBenefit_Sin_F': 'Maternity Leave Benefits (Sinhala)',
457
  '20250825_Budget2026Proposal_MaternityLeaveBenefit_Tam_F': 'Maternity Leave Benefits (Tamil)',
458
  '20250825_Budget2026Proposal_RemovalOfTaxationOnEPF_Sin_Final': 'EPF Tax Removal (Sinhala)',
 
546
  '20250908_Budget2026Proposal_Template.pdf'
547
  ]
548
 
549
+ def extract_sources_from_search_context(search_context: str, user_language: str = 'en') -> List[Dict[str, str]]:
550
+ """Extract source documents from search context with short names, filtered by user language"""
551
  sources = []
552
 
553
  # Get dynamically available PDF files
 
571
  if pdf in search_context:
572
  found_files.add(pdf)
573
 
574
+ # Filter by user language preference
575
+ language_filtered_files = set()
576
+
577
  for pdf in found_files:
578
+ # Determine document language from filename
579
+ doc_language = get_document_language(pdf)
580
+
581
+ # Language matching logic
582
+ if user_language == 'en' or user_language == 'singlish':
583
+ # English users see only English documents when any were found (otherwise all documents); Singlish users see every document
584
+ if doc_language in ['en', 'english'] or user_language == 'singlish':
585
+ language_filtered_files.add(pdf)
586
+ elif not any(f for f in found_files if get_document_language(f) in ['en', 'english']):
587
+ # If no English version available, show other languages
588
+ language_filtered_files.add(pdf)
589
+ elif user_language == 'si' or user_language == 'sinhala':
590
+ # Sinhala users prefer Sinhala documents
591
+ if doc_language in ['si', 'sinhala']:
592
+ language_filtered_files.add(pdf)
593
+ elif not any(f for f in found_files if get_document_language(f) in ['si', 'sinhala']):
594
+ # If no Sinhala version available, show other languages
595
+ language_filtered_files.add(pdf)
596
+ elif user_language == 'ta' or user_language == 'tamil':
597
+ # Tamil users prefer Tamil documents
598
+ if doc_language in ['ta', 'tamil']:
599
+ language_filtered_files.add(pdf)
600
+ elif not any(f for f in found_files if get_document_language(f) in ['ta', 'tamil']):
601
+ # If no Tamil version available, show other languages
602
+ language_filtered_files.add(pdf)
603
+ else:
604
+ # Default: show all documents
605
+ language_filtered_files.add(pdf)
606
+
607
+ # Convert to list with short names
608
+ for pdf in language_filtered_files:
609
  sources.append({
610
  "filename": pdf,
611
  "short_name": get_short_document_name(pdf)
 
613
 
614
  return sources
615
 
616
+ def get_document_language(filename: str) -> str:
617
+ """Determine the language of a document from its filename"""
618
+ filename_lower = filename.lower()
619
+
620
+ if '_sin_' in filename_lower or '_sinhala_' in filename_lower:
621
+ return 'si'
622
+ elif '_tam_' in filename_lower or '_tamil_' in filename_lower:
623
+ return 'ta'
624
+ elif '_raj_' in filename_lower:
625
+ return 'en' # Treat Raj as English/default
626
+ elif '_en_' in filename_lower or '_english_' in filename_lower:
627
+ return 'en'
628
+ else:
629
+ # Default to English if no language indicator found
630
+ return 'en'
631
+
632
  def extract_sources_from_response(response: str) -> List[Dict[str, str]]:
633
  """Extract source documents mentioned in the response with short names (fallback method)"""
634
  sources = []
 
748
  # No need to translate response - Gemini handles language matching automatically
749
 
750
  # Extract sources from search context (where the actual filenames are)
751
+ sources = extract_sources_from_search_context(search_context, original_language)
752
 
753
  # Add messages to memory (store original user message for context)
754
  memory.chat_memory.add_user_message(user_message)