danulr05 committed on
Commit
b909fee
·
verified ·
1 Parent(s): 3c720c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -80
app.py CHANGED
@@ -48,7 +48,7 @@ if not PINECONE_API_KEY:
48
 
49
  # Initialize Pinecone and embedding model
50
  pc = Pinecone(api_key=PINECONE_API_KEY)
51
- BUDGET_INDEX_NAME = "budget-proposals"
52
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
53
 
54
  # Initialize LangChain components
@@ -68,62 +68,45 @@ def get_pinecone_index():
68
  return None
69
 
70
  def search_budget_proposals(query: str) -> str:
71
- """Search budget proposals directly from your new Pinecone index"""
72
  try:
73
- # Get Pinecone index
74
- pc_index = get_pinecone_index()
75
- if not pc_index:
76
- return "Error: Unable to connect to vector database."
77
-
78
- # Generate query embedding
79
- query_embedding = embed_model.encode(query).tolist()
80
 
81
- # Search in Pinecone with improved chunking metadata
82
- search_results = pc_index.query(
83
- vector=query_embedding,
84
- top_k=5,
85
- include_metadata=True,
86
- filter={"source": "budget_proposals"} # Filter for your budget proposals
87
  )
88
 
89
- if not search_results.matches:
90
- return "No relevant budget proposals found in the database."
91
-
92
- # Build context from search results with improved chunking metadata
93
- context_parts = []
94
- seen_files = set() # Avoid duplicate files
95
-
96
- for match in search_results.matches[:3]: # Limit to top 3 results
97
- metadata = match.metadata
98
- file_path = metadata.get("file_path", "")
99
- category = metadata.get("category", "")
100
- title = metadata.get("title", "")
101
- cost = metadata.get("costLKR", "")
102
- chunk_id = metadata.get("chunk_id", 0)
103
- quality_score = metadata.get("chunk_quality_score", 0)
104
- token_count = metadata.get("token_count", 0)
105
 
106
- # Skip if we've already included this file
107
- if file_path in seen_files:
108
- continue
109
- seen_files.add(file_path)
110
 
111
- context_parts.append(f"📄 From {file_path} ({category}): {title}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- # Add chunk content (this would be the actual text chunk)
114
- # Note: The chunk text isn't stored in metadata, it would need to be retrieved separately
115
- # For now, we'll use the title and metadata
116
-
117
- if cost and cost != "No Costing Available":
118
- context_parts.append(f"💰 Cost: {cost}")
119
-
120
- # Add quality info for debugging
121
- context_parts.append(f"📊 Quality: {quality_score:.3f}, Tokens: {token_count}, Chunk: {chunk_id}")
122
-
123
- if not context_parts:
124
- return "No relevant budget proposals found in the database."
125
-
126
- return "\n\n".join(context_parts)
127
 
128
  except Exception as e:
129
  logger.error(f"Error searching budget proposals: {e}")
@@ -206,11 +189,11 @@ def get_available_pdfs() -> List[str]:
206
  return pdf_files
207
  else:
208
  # Fallback to known PDFs if directory doesn't exist
209
- return ['1750164001872.pdf', 'Audit_EPF.pdf', 'Cigs.pdf', 'Discretion.pdf', 'Elec.pdf', 'EPF.pdf', 'MLB.pdf']
210
  except Exception as e:
211
  logger.error(f"Error getting available PDFs: {e}")
212
- # Fallback to known PDFs from your processing log
213
- return ['1750164001872.pdf', 'Audit_EPF.pdf', 'Cigs.pdf', 'Discretion.pdf', 'Elec.pdf', 'EPF.pdf', 'MLB.pdf']
214
 
215
  def extract_sources_from_response(response: str) -> List[str]:
216
  """Extract source documents mentioned in the response"""
@@ -433,37 +416,19 @@ def get_chat_history(session_id: str):
433
  def chat_health():
434
  """Health check for the enhanced chatbot"""
435
  try:
 
 
 
 
436
  # Test vector database connection
437
  pc_index = get_pinecone_index()
438
- vector_db_status = "disconnected"
439
- index_stats = {}
440
-
441
- if pc_index:
442
- try:
443
- # Test actual connection with a quick query
444
- stats = pc_index.describe_index_stats()
445
- vector_db_status = "connected"
446
- index_stats = {
447
- "total_vectors": stats.total_vector_count,
448
- "index_name": BUDGET_INDEX_NAME
449
- }
450
- except Exception as e:
451
- vector_db_status = f"error: {str(e)}"
452
-
453
- # Test LangChain connection
454
- try:
455
- test_agent = create_agent("health_check")
456
- test_response = test_agent.invoke({"input": "Hello"})
457
- langchain_status = "connected" if test_response else "disconnected"
458
- except Exception as e:
459
- langchain_status = f"error: {str(e)}"
460
 
461
  return jsonify({
462
- "status": "healthy" if vector_db_status == "connected" else "partial",
463
  "message": "Enhanced budget proposals chatbot with RAG is running",
464
- "langchain_status": langchain_status,
465
  "vector_db_status": vector_db_status,
466
- "index_stats": index_stats,
467
  "rag_enabled": True,
468
  "active_sessions": len(conversation_memories),
469
  "memory_enabled": True
@@ -558,5 +523,4 @@ def home():
558
  })
559
 
560
  if __name__ == '__main__':
561
- app.run(debug=False, host='0.0.0.0', port=7860)
562
-
 
48
 
49
  # Initialize Pinecone and embedding model
50
  pc = Pinecone(api_key=PINECONE_API_KEY)
51
+ BUDGET_INDEX_NAME = "budget-proposals-index"
52
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
53
 
54
  # Initialize LangChain components
 
68
  return None
69
 
70
  def search_budget_proposals(query: str) -> str:
71
+ """Search budget proposals using the semantic search API"""
72
  try:
73
+ import requests
 
 
 
 
 
 
74
 
75
+ # Use the deployed semantic search API
76
+ response = requests.post(
77
+ f"https://danulr05-budget-proposals-search-api.hf.space/api/search",
78
+ json={"query": query, "top_k": 5},
79
+ timeout=10
 
80
  )
81
 
82
+ if response.status_code == 200:
83
+ data = response.json()
84
+ results = data.get("results", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if not results:
87
+ return "No relevant budget proposals found in the database."
 
 
88
 
89
+ # Build context from search results
90
+ context_parts = []
91
+ for result in results[:3]: # Limit to top 3 results
92
+ file_path = result.get("file_path", "")
93
+ category = result.get("category", "")
94
+ summary = result.get("summary", "")
95
+ cost = result.get("costLKR", "")
96
+ title = result.get("title", "")
97
+ content = result.get("content", "") # Get the actual content
98
+
99
+ context_parts.append(f"From {file_path} ({category}): {title}")
100
+ if content:
101
+ context_parts.append(f"Content: {content}")
102
+ elif summary:
103
+ context_parts.append(f"Summary: {summary}")
104
+ if cost and cost != "No Costing Available":
105
+ context_parts.append(f"Cost: {cost}")
106
 
107
+ return "\n\n".join(context_parts)
108
+ else:
109
+ return f"Error accessing semantic search API: {response.status_code}"
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  except Exception as e:
112
  logger.error(f"Error searching budget proposals: {e}")
 
189
  return pdf_files
190
  else:
191
  # Fallback to known PDFs if directory doesn't exist
192
+ return ['MLB.pdf', 'Cigs.pdf', 'Elec.pdf', 'Audit_EPF.pdf', 'EPF.pdf', 'Discretion.pdf', '1750164001872.pdf']
193
  except Exception as e:
194
  logger.error(f"Error getting available PDFs: {e}")
195
+ # Fallback to known PDFs
196
+ return ['MLB.pdf', 'Cigs.pdf', 'Elec.pdf', 'Audit_EPF.pdf', 'EPF.pdf', 'Discretion.pdf', '1750164001872.pdf']
197
 
198
  def extract_sources_from_response(response: str) -> List[str]:
199
  """Extract source documents mentioned in the response"""
 
416
  def chat_health():
417
  """Health check for the enhanced chatbot"""
418
  try:
419
+ # Test LangChain connection and vector database
420
+ test_agent = create_agent("health_check")
421
+ test_response = test_agent.invoke({"input": "Hello"})
422
+
423
  # Test vector database connection
424
  pc_index = get_pinecone_index()
425
+ vector_db_status = "connected" if pc_index else "disconnected"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
  return jsonify({
428
+ "status": "healthy",
429
  "message": "Enhanced budget proposals chatbot with RAG is running",
430
+ "langchain_status": "connected" if test_response else "disconnected",
431
  "vector_db_status": vector_db_status,
 
432
  "rag_enabled": True,
433
  "active_sessions": len(conversation_memories),
434
  "memory_enabled": True
 
523
  })
524
 
525
  if __name__ == '__main__':
526
+ app.run(debug=False, host='0.0.0.0', port=7860)