danulr05 committed on
Commit
b909fee
·
verified ·
1 Parent(s): 3c720c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -80
app.py CHANGED
@@ -48,7 +48,7 @@ if not PINECONE_API_KEY:
48
 
49
  # Initialize Pinecone and embedding model
50
  pc = Pinecone(api_key=PINECONE_API_KEY)
51
- BUDGET_INDEX_NAME = "budget-proposals"
52
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
53
 
54
  # Initialize LangChain components
@@ -68,62 +68,45 @@ def get_pinecone_index():
68
  return None
69
 
70
  def search_budget_proposals(query: str) -> str:
71
- """Search budget proposals directly from your new Pinecone index"""
72
  try:
73
- # Get Pinecone index
74
- pc_index = get_pinecone_index()
75
- if not pc_index:
76
- return "Error: Unable to connect to vector database."
77
-
78
- # Generate query embedding
79
- query_embedding = embed_model.encode(query).tolist()
80
 
81
- # Search in Pinecone with improved chunking metadata
82
- search_results = pc_index.query(
83
- vector=query_embedding,
84
- top_k=5,
85
- include_metadata=True,
86
- filter={"source": "budget_proposals"} # Filter for your budget proposals
87
  )
88
 
89
- if not search_results.matches:
90
- return "No relevant budget proposals found in the database."
91
-
92
- # Build context from search results with improved chunking metadata
93
- context_parts = []
94
- seen_files = set() # Avoid duplicate files
95
-
96
- for match in search_results.matches[:3]: # Limit to top 3 results
97
- metadata = match.metadata
98
- file_path = metadata.get("file_path", "")
99
- category = metadata.get("category", "")
100
- title = metadata.get("title", "")
101
- cost = metadata.get("costLKR", "")
102
- chunk_id = metadata.get("chunk_id", 0)
103
- quality_score = metadata.get("chunk_quality_score", 0)
104
- token_count = metadata.get("token_count", 0)
105
 
106
- # Skip if we've already included this file
107
- if file_path in seen_files:
108
- continue
109
- seen_files.add(file_path)
110
 
111
- context_parts.append(f"📄 From {file_path} ({category}): {title}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- # Add chunk content (this would be the actual text chunk)
114
- # Note: The chunk text isn't stored in metadata, it would need to be retrieved separately
115
- # For now, we'll use the title and metadata
116
-
117
- if cost and cost != "No Costing Available":
118
- context_parts.append(f"💰 Cost: {cost}")
119
-
120
- # Add quality info for debugging
121
- context_parts.append(f"📊 Quality: {quality_score:.3f}, Tokens: {token_count}, Chunk: {chunk_id}")
122
-
123
- if not context_parts:
124
- return "No relevant budget proposals found in the database."
125
-
126
- return "\n\n".join(context_parts)
127
 
128
  except Exception as e:
129
  logger.error(f"Error searching budget proposals: {e}")
@@ -206,11 +189,11 @@ def get_available_pdfs() -> List[str]:
206
  return pdf_files
207
  else:
208
  # Fallback to known PDFs if directory doesn't exist
209
- return ['1750164001872.pdf', 'Audit_EPF.pdf', 'Cigs.pdf', 'Discretion.pdf', 'Elec.pdf', 'EPF.pdf', 'MLB.pdf']
210
  except Exception as e:
211
  logger.error(f"Error getting available PDFs: {e}")
212
- # Fallback to known PDFs from your processing log
213
- return ['1750164001872.pdf', 'Audit_EPF.pdf', 'Cigs.pdf', 'Discretion.pdf', 'Elec.pdf', 'EPF.pdf', 'MLB.pdf']
214
 
215
  def extract_sources_from_response(response: str) -> List[str]:
216
  """Extract source documents mentioned in the response"""
@@ -433,37 +416,19 @@ def get_chat_history(session_id: str):
433
  def chat_health():
434
  """Health check for the enhanced chatbot"""
435
  try:
 
 
 
 
436
  # Test vector database connection
437
  pc_index = get_pinecone_index()
438
- vector_db_status = "disconnected"
439
- index_stats = {}
440
-
441
- if pc_index:
442
- try:
443
- # Test actual connection with a quick query
444
- stats = pc_index.describe_index_stats()
445
- vector_db_status = "connected"
446
- index_stats = {
447
- "total_vectors": stats.total_vector_count,
448
- "index_name": BUDGET_INDEX_NAME
449
- }
450
- except Exception as e:
451
- vector_db_status = f"error: {str(e)}"
452
-
453
- # Test LangChain connection
454
- try:
455
- test_agent = create_agent("health_check")
456
- test_response = test_agent.invoke({"input": "Hello"})
457
- langchain_status = "connected" if test_response else "disconnected"
458
- except Exception as e:
459
- langchain_status = f"error: {str(e)}"
460
 
461
  return jsonify({
462
- "status": "healthy" if vector_db_status == "connected" else "partial",
463
  "message": "Enhanced budget proposals chatbot with RAG is running",
464
- "langchain_status": langchain_status,
465
  "vector_db_status": vector_db_status,
466
- "index_stats": index_stats,
467
  "rag_enabled": True,
468
  "active_sessions": len(conversation_memories),
469
  "memory_enabled": True
@@ -558,5 +523,4 @@ def home():
558
  })
559
 
560
  if __name__ == '__main__':
561
- app.run(debug=False, host='0.0.0.0', port=7860)
562
-
 
48
 
49
  # Initialize Pinecone and embedding model
50
  pc = Pinecone(api_key=PINECONE_API_KEY)
51
+ BUDGET_INDEX_NAME = "budget-proposals-index"
52
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
53
 
54
  # Initialize LangChain components
 
68
  return None
69
 
70
  def search_budget_proposals(query: str) -> str:
71
+ """Search budget proposals using the semantic search API"""
72
  try:
73
+ import requests
 
 
 
 
 
 
74
 
75
+ # Use the deployed semantic search API
76
+ response = requests.post(
77
+ f"https://danulr05-budget-proposals-search-api.hf.space/api/search",
78
+ json={"query": query, "top_k": 5},
79
+ timeout=10
 
80
  )
81
 
82
+ if response.status_code == 200:
83
+ data = response.json()
84
+ results = data.get("results", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if not results:
87
+ return "No relevant budget proposals found in the database."
 
 
88
 
89
+ # Build context from search results
90
+ context_parts = []
91
+ for result in results[:3]: # Limit to top 3 results
92
+ file_path = result.get("file_path", "")
93
+ category = result.get("category", "")
94
+ summary = result.get("summary", "")
95
+ cost = result.get("costLKR", "")
96
+ title = result.get("title", "")
97
+ content = result.get("content", "") # Get the actual content
98
+
99
+ context_parts.append(f"From {file_path} ({category}): {title}")
100
+ if content:
101
+ context_parts.append(f"Content: {content}")
102
+ elif summary:
103
+ context_parts.append(f"Summary: {summary}")
104
+ if cost and cost != "No Costing Available":
105
+ context_parts.append(f"Cost: {cost}")
106
 
107
+ return "\n\n".join(context_parts)
108
+ else:
109
+ return f"Error accessing semantic search API: {response.status_code}"
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  except Exception as e:
112
  logger.error(f"Error searching budget proposals: {e}")
 
189
  return pdf_files
190
  else:
191
  # Fallback to known PDFs if directory doesn't exist
192
+ return ['MLB.pdf', 'Cigs.pdf', 'Elec.pdf', 'Audit_EPF.pdf', 'EPF.pdf', 'Discretion.pdf', '1750164001872.pdf']
193
  except Exception as e:
194
  logger.error(f"Error getting available PDFs: {e}")
195
+ # Fallback to known PDFs
196
+ return ['MLB.pdf', 'Cigs.pdf', 'Elec.pdf', 'Audit_EPF.pdf', 'EPF.pdf', 'Discretion.pdf', '1750164001872.pdf']
197
 
198
  def extract_sources_from_response(response: str) -> List[str]:
199
  """Extract source documents mentioned in the response"""
 
416
  def chat_health():
417
  """Health check for the enhanced chatbot"""
418
  try:
419
+ # Test LangChain connection and vector database
420
+ test_agent = create_agent("health_check")
421
+ test_response = test_agent.invoke({"input": "Hello"})
422
+
423
  # Test vector database connection
424
  pc_index = get_pinecone_index()
425
+ vector_db_status = "connected" if pc_index else "disconnected"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
  return jsonify({
428
+ "status": "healthy",
429
  "message": "Enhanced budget proposals chatbot with RAG is running",
430
+ "langchain_status": "connected" if test_response else "disconnected",
431
  "vector_db_status": vector_db_status,
 
432
  "rag_enabled": True,
433
  "active_sessions": len(conversation_memories),
434
  "memory_enabled": True
 
523
  })
524
 
525
  if __name__ == '__main__':
526
+ app.run(debug=False, host='0.0.0.0', port=7860)