Spaces:

tomvaillant
/

graphics-llm

Running

Tom Claude committed on Nov 11

Commit

84f99ae

1 Parent(s): ef7af85

feat: Implement hybrid search with word boundaries, reorder UI, and add user API key management

Major improvements to search relevancy, UX, and security:

**Search Optimization**:
- Implement PostgreSQL regex with word boundaries (\m \M) for exact matching
- Fix false positives (e.g., "F1" no longer matches "profile" or "if")
- Update Vanna system prompt with regex guidance and examples
- Create query function templates with hybrid search support

**UI Improvements**:
- Reorder modes: Inspiration (default) → Refinement → Chart
- Rename buttons for clarity and brevity
- Update app description to reflect all modes
- Change "Voir la source" to "Source" for consistency

**API Key Management**:
- Users now provide their own Datawrapper API keys
- Persistent storage via browser localStorage
- Session state management with validation
- Yellow warning box for permissions requirements
- Graceful error handling for missing/invalid keys
- Remove hardcoded DATAWRAPPER_ACCESS_TOKEN dependency

**New Files**:
- src/query_intent_classifier.py: Intent classification for hybrid search
- src/vanna_query_functions.py: SQL template functions with regex

**Technical Details**:
- Word boundary regex: ~* operator with \m and \M markers
- Hybrid search combines tag matching + keyword search with OR logic
- LEFT JOINs ensure untagged posts (7,245+) are included
- JavaScript localStorage integration for API key persistence

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (4) hide show

app.py +272 -73
src/query_intent_classifier.py +238 -0
src/vanna.py +87 -32
src/vanna_query_functions.py +300 -0

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ Now with Datawrapper integration for chart generation!
 import os
 import io
 import asyncio
 import pandas as pd
 import gradio as gr
 from dotenv import load_dotenv
@@ -18,6 +19,7 @@ from src.datawrapper_client import create_and_publish_chart, get_iframe_html
 from datetime import datetime, timedelta
 from collections import defaultdict
 from src.vanna import VannaComponent
 # Load environment variables
 load_dotenv()
@@ -54,6 +56,32 @@ except Exception as e:
     print(f"✗ Error initializing Vanna: {e}")
     raise
 def check_rate_limit(request: gr.Request) -> tuple[bool, int]:
     """Check if user has exceeded rate limit"""
     if request is None:
@@ -110,23 +138,41 @@ def recommend_stream(message: str, history: list, request: gr.Request):
         yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_URL, SUPABASE_KEY) and try again."
-def generate_chart_from_csv(csv_file, user_prompt):
     """
-    Generate a Datawrapper chart from uploaded CSV and user prompt.
     Args:
         csv_file: Uploaded CSV file
         user_prompt: User's description of the chart
     Returns:
         HTML string with iframe or error message
     """
     if not csv_file:
         return "<div style='padding: 50px; text-align: center;'>Please upload a CSV file to generate a chart.</div>"
     if not user_prompt or user_prompt.strip() == "":
         return "<div style='padding: 50px; text-align: center;'>Please describe what chart you want to create.</div>"
     try:
         # Show loading message
         loading_html = """
@@ -192,9 +238,15 @@ def generate_chart_from_csv(csv_file, user_prompt):
         <div style='padding: 50px; text-align: center; color: red;'>
             <h3>❌ Error</h3>
             <p>{str(e)}</p>
-            <p style='font-size: 0.9em; color: #666;'>Please ensure your CSV is properly formatted and try again.</p>
         </div>
         """
 def csv_to_cards_html(csv_text: str) -> str:
     """
@@ -211,11 +263,7 @@ def csv_to_cards_html(csv_text: str) -> str:
             source_url = row.get("source_url", "#")
             author = row.get("author", "Inconnu")
             published_date = row.get("published_date", "")
-            if not published_date == "nan":
-                published_date = ""
-            image_url = row.get("image_url", "")
-            if not image_url == "nan":
-                image_url = "https://fpoimg.com/800x600?text=Image+not+found"
             cards_html += f"""
             <div style="background: white; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);
@@ -227,7 +275,7 @@ def csv_to_cards_html(csv_text: str) -> str:
                     <p style="margin:0; color:#999; font-size:0.8em;">{published_date}</p>
                     <a href="{source_url}" target="_blank"
                        style="display:inline-block; margin-top:8px; font-size:0.9em; color:#1976d2; text-decoration:none;">
-                       🔗 Voir la source
                     </a>
                 </div>
             </div>
@@ -262,20 +310,60 @@ async def search_inspiration_from_database(user_prompt):
         """
     try:
-        response = await vanna.ask(user_prompt)
-        print("response :", repr(response))
         clean_response = response.strip()
-        if clean_response.startswith("⚠️") or "Aucun CSV détecté" in clean_response:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
-                <h3>❌ No valid data found</h3>
-                <p>The AI couldn't generate any data for this request. Try being more specific — for example:
-                   <em>"Show me spotlights from 2020 about design"</em>.</p>
             </div>
             """
         csv_text = (
             clean_response
             .strip("```")
@@ -283,11 +371,15 @@ async def search_inspiration_from_database(user_prompt):
             .replace("CSV", "")
         )
-        if "," not in csv_text:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
-                <h3>❌ No valid CSV detected</h3>
-                <p>The model didn't return any structured data. Try rephrasing your query to be more precise.</p>
             </div>
             """
@@ -295,11 +387,17 @@ async def search_inspiration_from_database(user_prompt):
         return cards_html
     except Exception as e:
         return f"""
         <div style='padding: 50px; text-align: center; color: red;'>
-            <h3>❌ Error</h3>
-            <p>{str(e)}</p>
-            <p style='font-size: 0.9em; color: #666;'>Please try again.</p>
         </div>
         """
@@ -332,18 +430,63 @@ with gr.Blocks(
     gr.Markdown("""
     # 📊 Viz LLM
-    Get design recommendations or generate charts with AI-powered data visualization assistance.
     """)
-    # Mode selector buttons
     with gr.Row():
-        ideation_btn = gr.Button("💡 Ideation Mode", variant="primary", elem_classes="mode-button")
-        chart_gen_btn = gr.Button("📊 Chart Generation Mode", variant="secondary", elem_classes="mode-button")
-        inspiration_btn = gr.Button("✨ Inspiration Mode", variant="secondary", elem_classes="mode-button")
-    # Ideation Mode: Chat interface (shown by default, wrapped in Column)
-    with gr.Column(visible=True) as ideation_container:
         ideation_interface = gr.ChatInterface(
             fn=recommend_stream,
             type="messages",
@@ -360,6 +503,32 @@ with gr.Blocks(
     # Chart Generation Mode: Chart controls and output (hidden by default)
     with gr.Column(visible=False) as chart_gen_container:
         csv_upload = gr.File(
             label="📁 Upload CSV File",
             file_types=[".csv"],
@@ -379,79 +548,111 @@ with gr.Blocks(
             label="Generated Chart"
         )
-    # Inspiration Mode:
-    with gr.Column(visible=False) as inspiration_container:
-      with gr.Row():
-        inspiration_prompt_input = gr.Textbox(
-            placeholder="Ask for an inspiration...",
-            show_label=False,
-            scale=4,
-            container=False
-        )
-        inspiration_search_btn = gr.Button("🔍 Search", variant="primary", scale=1)
-      inspiration_cards_html = gr.HTML("")
-    # Mode switching functions
     def switch_to_ideation():
         return [
             gr.update(variant="primary"),  # ideation_btn
             gr.update(variant="secondary"),  # chart_gen_btn
-            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(visible=True),  # ideation_container
             gr.update(visible=False),  # chart_gen_container
-            gr.update(visible=False),  # inspiration_container
         ]
     def switch_to_chart_gen():
         return [
             gr.update(variant="secondary"),  # ideation_btn
             gr.update(variant="primary"),  # chart_gen_btn
-            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(visible=False),  # ideation_container
             gr.update(visible=True),  # chart_gen_container
-            gr.update(visible=False),  # inspiration_container
         ]
-    def switch_to_inspiration():
-        return [
-            gr.update(variant="secondary"),  # ideation_btn
-            gr.update(variant="secondary"),  # chart_gen_btn
-            gr.update(variant="primary"),  # inspiration_btn
-            gr.update(visible=False),  # ideation_container
-            gr.update(visible=False),  # chart_gen_container
-            gr.update(visible=True),  # inspiration_container
-        ]
-    # Wire up mode switching
     ideation_btn.click(
         fn=switch_to_ideation,
         inputs=[],
-        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
     chart_gen_btn.click(
         fn=switch_to_chart_gen,
         inputs=[],
-        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
-    inspiration_btn.click(
-        fn=switch_to_inspiration,
-        inputs=[],
-        outputs=[ideation_btn, chart_gen_btn, inspiration_btn, ideation_container, chart_gen_container, inspiration_container]
     )
-    # Generate chart when button is clicked
     generate_chart_btn.click(
         fn=generate_chart_from_csv,
-        inputs=[csv_upload, chart_prompt_input],
         outputs=[chart_output]
     )
-    # Search inspiration when button is clicked
     inspiration_search_btn.click(
-        fn=search_inspiration_from_database,
         inputs=[inspiration_prompt_input],
         outputs=[inspiration_cards_html]
     )
@@ -460,11 +661,11 @@ with gr.Blocks(
     gr.Markdown("""
     ### About Viz LLM
-    **Ideation Mode:** Get design recommendations based on research papers, design principles, and examples from the field of information graphics and data visualization.
-    **Chart Generation Mode:** Upload your CSV data and describe your visualization goal. The AI will analyze your data, select the optimal chart type, and generate a publication-ready chart using Datawrapper.
-    **Inspiration Mode:** Coming soon.
     **Credits:** Special thanks to the researchers whose work informed this model: Robert Kosara, Edward Segel, Jeffrey Heer, Matthew Conlen, John Maeda, Kennedy Elliott, Scott McCloud, and many others.
@@ -473,21 +674,19 @@ with gr.Blocks(
     **Usage Limits:** This service is limited to 20 queries per day per user to manage costs. Responses are optimized for English.
     <div style="text-align: center; margin-top: 20px; opacity: 0.6; font-size: 0.9em;">
-    Embeddings: Jina-CLIP-v2 | Charts: Datawrapper API
     </div>
     """)
 # Launch configuration
 if __name__ == "__main__":
-    # Check for required environment variables
-    required_vars = ["SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN", "DATAWRAPPER_ACCESS_TOKEN"]
     missing_vars = [var for var in required_vars if not os.getenv(var)]
     if missing_vars:
         print(f"⚠️  Warning: Missing environment variables: {', '.join(missing_vars)}")
         print("Please set these in your .env file or as environment variables")
-        if "DATAWRAPPER_ACCESS_TOKEN" in missing_vars:
-            print("Note: DATAWRAPPER_ACCESS_TOKEN is required for chart generation mode")
     # Launch the app
     demo.launch(

 import os
 import io
 import asyncio
+import time
 import pandas as pd
 import gradio as gr
 from dotenv import load_dotenv
 from datetime import datetime, timedelta
 from collections import defaultdict
 from src.vanna import VannaComponent
+from src.query_intent_classifier import classify_query, IntentClassifier
 # Load environment variables
 load_dotenv()
     print(f"✗ Error initializing Vanna: {e}")
     raise
+# CSV cleanup function
+def cleanup_old_csv_files():
+    """Delete CSV files older than 24 hours to prevent accumulation"""
+    folder = "513935c4d2db2d2d"
+    if not os.path.exists(folder):
+        return
+    cleaned = 0
+    for file in os.listdir(folder):
+        if file.endswith(".csv"):
+            file_path = os.path.join(folder, file)
+            try:
+                # Check if file is older than 24 hours
+                if os.path.getmtime(file_path) < time.time() - 86400:
+                    os.remove(file_path)
+                    cleaned += 1
+            except Exception as e:
+                print(f"Warning: Could not delete {file_path}: {e}")
+    if cleaned > 0:
+        print(f"✓ Cleaned up {cleaned} old CSV files")
+# Run cleanup on startup
+print("Cleaning up old CSV files...")
+cleanup_old_csv_files()
 def check_rate_limit(request: gr.Request) -> tuple[bool, int]:
     """Check if user has exceeded rate limit"""
     if request is None:
         yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_URL, SUPABASE_KEY) and try again."
+def generate_chart_from_csv(csv_file, user_prompt, api_key):
     """
+    Generate a Datawrapper chart from uploaded CSV and user prompt using user's API key.
     Args:
         csv_file: Uploaded CSV file
         user_prompt: User's description of the chart
+        api_key: User's Datawrapper API key
     Returns:
         HTML string with iframe or error message
     """
+    # Validate API key first
+    if not api_key or api_key.strip() == "":
+        return """
+        <div style='padding: 50px; text-align: center; color: #d9534f;'>
+            <h3>❌ No API Key Provided</h3>
+            <p>Please enter your Datawrapper API key above to generate charts.</p>
+            <p style='margin-top: 15px;'>
+                <a href='https://app.datawrapper.de/account/api-tokens' target='_blank'
+                   style='color: #1976d2; text-decoration: underline;'>Get your API key →</a>
+            </p>
+        </div>
+        """
     if not csv_file:
         return "<div style='padding: 50px; text-align: center;'>Please upload a CSV file to generate a chart.</div>"
     if not user_prompt or user_prompt.strip() == "":
         return "<div style='padding: 50px; text-align: center;'>Please describe what chart you want to create.</div>"
+    # Temporarily set the API key in environment for this request
+    original_key = os.environ.get("DATAWRAPPER_ACCESS_TOKEN")
+    os.environ["DATAWRAPPER_ACCESS_TOKEN"] = api_key
     try:
         # Show loading message
         loading_html = """
         <div style='padding: 50px; text-align: center; color: red;'>
             <h3>❌ Error</h3>
             <p>{str(e)}</p>
+            <p style='font-size: 0.9em; color: #666;'>Please ensure your CSV is properly formatted and your API key is correct.</p>
         </div>
         """
+    finally:
+        # Restore original API key or remove it
+        if original_key:
+            os.environ["DATAWRAPPER_ACCESS_TOKEN"] = original_key
+        elif "DATAWRAPPER_ACCESS_TOKEN" in os.environ:
+            del os.environ["DATAWRAPPER_ACCESS_TOKEN"]
 def csv_to_cards_html(csv_text: str) -> str:
     """
             source_url = row.get("source_url", "#")
             author = row.get("author", "Inconnu")
             published_date = row.get("published_date", "")
+            image_url = row.get("image_url", "https://fpoimg.com/800x600?text=Image+not+found")
             cards_html += f"""
             <div style="background: white; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);
                     <p style="margin:0; color:#999; font-size:0.8em;">{published_date}</p>
                     <a href="{source_url}" target="_blank"
                        style="display:inline-block; margin-top:8px; font-size:0.9em; color:#1976d2; text-decoration:none;">
+                       🔗 Source
                     </a>
                 </div>
             </div>
         """
     try:
+        # Classify user intent
+        print(f"\n{'='*60}")
+        print(f"[SEARCH] User prompt: {user_prompt}")
+        classifier = IntentClassifier()
+        classification = classifier.classify(user_prompt)
+        print(f"[INTENT] Type: {classification['intent'].value}")
+        print(f"[INTENT] Keywords: {classification['keywords']}")
+        print(f"[INTENT] Inferred tags: {classification['tags']}")
+        print(f"[INTENT] Short query: {classification['is_short_query']}")
+        # Enhance prompt with intent guidance
+        enhanced_prompt = classifier.format_for_vanna(classification)
+        full_prompt = f"{user_prompt}\n\n{enhanced_prompt}"
+        print(f"[VANNA] Sending enhanced prompt to Vanna...")
+        response = await vanna.ask(full_prompt)
+        print(f"[VANNA] Response received: {repr(response)[:200]}...")
+        print(f"{'='*60}\n")
         clean_response = response.strip()
+        # Check for empty query results (0 rows returned)
+        if "No rows returned" in clean_response or "0 rows" in clean_response.lower():
+            return f"""
+            <div style='padding: 50px; text-align: center; color: #f0ad4e;'>
+                <h3>🔍 No Results Found</h3>
+                <p>Your query was executed successfully, but no posts matched your criteria.</p>
+                <p style='margin-top: 15px; font-weight: 600;'>Suggestions:</p>
+                <ul style='list-style: none; padding: 0; text-align: left; display: inline-block;'>
+                    <li>• Try broader keywords (e.g., "visualization" instead of "F1 dataviz")</li>
+                    <li>• Search by author names (e.g., "New York Times")</li>
+                    <li>• Use simple terms (e.g., "interactive", "maps")</li>
+                </ul>
+                <p style='margin-top: 15px; font-style: italic; color: #666; font-size: 0.9em;'>
+                    <strong>Note:</strong> Most posts are currently being enriched with tags.<br/>
+                    Keyword search works for all {classification.get('total_posts', '7,000+')} posts in the database.
+                </p>
+            </div>
+            """
+        # Check for errors or warnings
+        if clean_response.startswith("⚠️") or clean_response.startswith("❌") or "Aucun CSV détecté" in clean_response:
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
+                <h3>❌ Query Error</h3>
+                <p>The AI encountered an issue processing your request.</p>
+                <p style='margin-top: 10px; font-size: 0.9em; color: #666;'>{clean_response[:200]}</p>
+                <p style='margin-top: 15px;'>Try rephrasing your query or being more specific.</p>
             </div>
             """
+        # Process CSV response
         csv_text = (
             clean_response
             .strip("```")
             .replace("CSV", "")
         )
+        # Check if response contains CSV data
+        if "," not in csv_text or "id,title" not in csv_text.lower():
             return f"""
             <div style='padding: 50px; text-align: center; color: #d9534f;'>
+                <h3>❌ Invalid Response Format</h3>
+                <p>The database query didn't return structured data.</p>
+                <p style='margin-top: 10px; font-size: 0.9em; color: #666;'>
+                    This might be a temporary issue. Please try again.
+                </p>
             </div>
             """
         return cards_html
     except Exception as e:
+        print(f"❌ Exception in search_inspiration_from_database: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return f"""
         <div style='padding: 50px; text-align: center; color: red;'>
+            <h3>❌ System Error</h3>
+            <p style='margin-bottom: 10px;'>An unexpected error occurred:</p>
+            <p style='font-family: monospace; font-size: 0.85em; color: #666;'>{str(e)}</p>
+            <p style='margin-top: 15px; font-size: 0.9em; color: #666;'>
+                Please check the console logs for more details.
+            </p>
         </div>
         """
     gr.Markdown("""
     # 📊 Viz LLM
+    Discover inspiring visualizations, refine your design ideas, or generate publication-ready charts with AI assistance.
+    """)
+    # JavaScript for localStorage persistence
+    gr.HTML("""
+    <script>
+        // Save API key to localStorage when it changes
+        function saveApiKeyToStorage(key) {
+            if (key && key.trim() !== '') {
+                localStorage.setItem('datawrapper_api_key', key);
+            }
+        }
+        // Load API key from localStorage on page load
+        function loadApiKeyFromStorage() {
+            return localStorage.getItem('datawrapper_api_key') || '';
+        }
+        // Auto-load API key when the page loads
+        window.addEventListener('DOMContentLoaded', function() {
+            setTimeout(function() {
+                const savedKey = loadApiKeyFromStorage();
+                if (savedKey) {
+                    const apiKeyInput = document.querySelector('input[type="password"]');
+                    if (apiKeyInput) {
+                        apiKeyInput.value = savedKey;
+                        // Trigger change event to update Gradio state
+                        apiKeyInput.dispatchEvent(new Event('input', { bubbles: true }));
+                    }
+                }
+            }, 1000);
+        });
+    </script>
     """)
+    # Mode selector buttons (reordered: Inspiration, Refinement, Chart)
     with gr.Row():
+        inspiration_btn = gr.Button("✨ Inspiration", variant="primary", elem_classes="mode-button")
+        ideation_btn = gr.Button("💡 Refinement", variant="secondary", elem_classes="mode-button")
+        chart_gen_btn = gr.Button("📊 Chart", variant="secondary", elem_classes="mode-button")
+    # Inspiration Mode: Search interface (shown by default)
+    with gr.Column(visible=True) as inspiration_container:
+      with gr.Row():
+        inspiration_prompt_input = gr.Textbox(
+            placeholder="Search for inspiration (e.g., 'F1', 'interactive maps')...",
+            show_label=False,
+            scale=4,
+            container=False
+        )
+        inspiration_search_btn = gr.Button("🔍 Search", variant="primary", scale=1)
+      inspiration_cards_html = gr.HTML("")
+    # Refinement Mode: Chat interface (hidden by default, wrapped in Column)
+    with gr.Column(visible=False) as ideation_container:
         ideation_interface = gr.ChatInterface(
             fn=recommend_stream,
             type="messages",
     # Chart Generation Mode: Chart controls and output (hidden by default)
     with gr.Column(visible=False) as chart_gen_container:
+        gr.Markdown("### Chart Generator")
+        # API Key Input (collapsible)
+        with gr.Accordion("🔑 Datawrapper API Key", open=False):
+            gr.Markdown("""
+            Enter your Datawrapper API key to generate charts. Your key is stored in your browser and persists across sessions.
+            **Get your key**: [Datawrapper Account Settings](https://app.datawrapper.de/account/api-tokens)
+            """)
+            # Warning about permissions
+            gr.HTML("""
+            <div style="background: #fff3cd; border: 1px solid #ffc107; border-radius: 5px; padding: 12px; margin: 10px 0;">
+                <strong>⚠️ Important:</strong> When creating your API key, toggle <strong>ALL permissions</strong> (Read & Write for Charts, Tables, Folders, etc.) otherwise chart generation will fail.
+            </div>
+            """)
+            api_key_input = gr.Textbox(
+                label="API Key",
+                placeholder="Paste your Datawrapper API key here...",
+                type="password",
+                value=""
+            )
+            api_key_status = gr.Markdown("⚠️ Status: No API key provided")
         csv_upload = gr.File(
             label="📁 Upload CSV File",
             file_types=[".csv"],
             label="Generated Chart"
         )
+    # API key state management
+    api_key_state = gr.State(value="")
+    def validate_api_key(api_key: str) -> tuple[str, str]:
+        """Validate and store API key"""
+        if not api_key or api_key.strip() == "":
+            return "", "⚠️ Status: No API key provided"
+        # Basic validation (check format)
+        if len(api_key) < 20:
+            return "", "❌ Status: Invalid API key format (too short)"
+        # Key looks valid - it will be saved to localStorage via JavaScript
+        masked_key = f"...{api_key[-6:]}" if len(api_key) > 6 else "***"
+        return api_key, f"✅ Status: API key saved to browser storage (ends with {masked_key})"
+    # Mode switching functions (updated for new order: Inspiration, Refinement, Chart)
+    def switch_to_inspiration():
+        # Returns six updates, in the same order as the click handlers'
+        # outputs list: the three mode buttons, then the three containers.
+        # Inspiration's button becomes primary and only its container is visible.
+        return [
+            gr.update(variant="primary"),  # inspiration_btn
+            gr.update(variant="secondary"),  # ideation_btn
+            gr.update(variant="secondary"),  # chart_gen_btn
+            gr.update(visible=True),  # inspiration_container
+            gr.update(visible=False),  # ideation_container
+            gr.update(visible=False),  # chart_gen_container
+        ]
     def switch_to_ideation():
+        # Returns six updates, in the same order as the click handlers'
+        # outputs list: the three mode buttons, then the three containers.
+        # The ideation button becomes primary and only its container is visible.
         return [
+            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(variant="primary"),  # ideation_btn
             gr.update(variant="secondary"),  # chart_gen_btn
+            gr.update(visible=False),  # inspiration_container
             gr.update(visible=True),  # ideation_container
             gr.update(visible=False),  # chart_gen_container
         ]
     def switch_to_chart_gen():
+        # Returns six updates, in the same order as the click handlers'
+        # outputs list: the three mode buttons, then the three containers.
+        # The chart button becomes primary and only its container is visible.
         return [
+            gr.update(variant="secondary"),  # inspiration_btn
             gr.update(variant="secondary"),  # ideation_btn
             gr.update(variant="primary"),  # chart_gen_btn
+            gr.update(visible=False),  # inspiration_container
             gr.update(visible=False),  # ideation_container
             gr.update(visible=True),  # chart_gen_container
         ]
+    # Wire up mode switching (updated order: inspiration, ideation, chart)
+    inspiration_btn.click(
+        fn=switch_to_inspiration,
+        inputs=[],
+        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
+    )
     ideation_btn.click(
         fn=switch_to_ideation,
         inputs=[],
+        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
     )
     chart_gen_btn.click(
         fn=switch_to_chart_gen,
         inputs=[],
+        outputs=[inspiration_btn, ideation_btn, chart_gen_btn, inspiration_container, ideation_container, chart_gen_container]
     )
+    # Connect API key validation and localStorage save
+    api_key_input.change(
+        fn=validate_api_key,
+        inputs=[api_key_input],
+        outputs=[api_key_state, api_key_status],
+        js="(key) => { saveApiKeyToStorage(key); return key; }"
     )
+    # Generate chart when button is clicked (now with API key)
     generate_chart_btn.click(
         fn=generate_chart_from_csv,
+        inputs=[csv_upload, chart_prompt_input, api_key_state],
         outputs=[chart_output]
     )
+    # Search inspiration with loading state
+    def search_with_loading(prompt):
+        """Wrapper to show loading state"""
+        if not prompt or not prompt.strip():
+            return """
+            <div style='padding: 50px; text-align: center;'>
+                Please enter a search query.
+            </div>
+            """
+        # Show loading immediately (Gradio will display this first)
+        yield """
+        <div style='padding: 50px; text-align: center;'>
+            <div style='font-size: 2em; margin-bottom: 20px;'>🔍</div>
+            <h3>Searching database...</h3>
+            <p style='color: #666;'>Analyzing your query and generating SQL...</p>
+        </div>
+        """
+        # Run the actual search
+        import asyncio
+        result = asyncio.run(search_inspiration_from_database(prompt))
+        yield result
     inspiration_search_btn.click(
+        fn=search_with_loading,
         inputs=[inspiration_prompt_input],
         outputs=[inspiration_cards_html]
     )
     gr.Markdown("""
     ### About Viz LLM
+    **Inspiration**: Discover curated examples of data visualizations and information graphics from publications worldwide. Search by keyword, topic, or author.
+    **Refinement**: Get design recommendations based on research papers, design principles, and examples from the field of information graphics and data visualization.
+    **Chart**: Upload your CSV data and describe your visualization goal. The AI will analyze your data, select the optimal chart type, and generate a publication-ready chart using Datawrapper.
     **Credits:** Special thanks to the researchers whose work informed this model: Robert Kosara, Edward Segel, Jeffrey Heer, Matthew Conlen, John Maeda, Kennedy Elliott, Scott McCloud, and many others.
     **Usage Limits:** This service is limited to 20 queries per day per user to manage costs. Responses are optimized for English.
     <div style="text-align: center; margin-top: 20px; opacity: 0.6; font-size: 0.9em;">
+    Embeddings: Jina-CLIP-v2 | Charts: Datawrapper API | Database: Nuanced
     </div>
     """)
 # Launch configuration
 if __name__ == "__main__":
+    # Check for required environment variables (Datawrapper key now user-provided)
+    required_vars = ["SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN"]
     missing_vars = [var for var in required_vars if not os.getenv(var)]
     if missing_vars:
         print(f"⚠️  Warning: Missing environment variables: {', '.join(missing_vars)}")
         print("Please set these in your .env file or as environment variables")
     # Launch the app
     demo.launch(

src/query_intent_classifier.py ADDED Viewed

	@@ -0,0 +1,238 @@

+"""
+Query Intent Classifier for Hybrid Search
+Analyzes user queries to determine the best search strategy:
+- keyword: Full-text search on title/author/provider (works for all posts)
+- tag: Tag-based search (works only for tagged posts)
+- hybrid: Try both approaches
+"""
+import re
+from typing import Dict, List
+from enum import Enum
+class QueryIntent(Enum):
+    """Search strategy selected for a user query."""
+    # Full-text search on title/author/provider (works for all posts)
+    KEYWORD = "keyword"
+    # Tag-based search (works only for tagged posts)
+    TAG = "tag"
+    # Try both approaches
+    HYBRID = "hybrid"
+class IntentClassifier:
+    """
+    Classifies user queries and extracts relevant search parameters.
+    """
+    # Keywords that suggest tag search
+    TAG_INDICATORS = ["tagged", "category", "topic", "theme", "type", "about"]
+    # Common keywords to expand for better matching
+    KEYWORD_EXPANSIONS = {
+        "f1": ["f1", "formula 1", "formula one", "racing"],
+        "dataviz": ["dataviz", "data visualization", "visualization", "chart", "graph"],
+        "interactive": ["interactive", "interaction", "explore"],
+        "map": ["map", "maps", "mapping", "geographic", "geo"],
+        "nyt": ["new york times", "nyt", "ny times"],
+    }
+    def __init__(self):
+        """Create a classifier; no per-instance state is set up here."""
+        pass
+    def classify(self, user_prompt: str) -> Dict:
+        """
+        Classify user intent and extract search parameters.
+        Args:
+            user_prompt: The user's search query
+        Returns:
+            Dict with:
+                - intent: QueryIntent enum
+                - keywords: List of keywords to search
+                - tags: List of potential tags to search
+                - original_query: Original user prompt
+                - is_short_query: True when the normalized prompt has at
+                  most three whitespace-separated words
+        """
+        # All analysis below runs on the lower-cased, stripped prompt; the
+        # untouched prompt is still returned as "original_query".
+        prompt_lower = user_prompt.lower().strip()
+        # Detect intent
+        intent = self._detect_intent(prompt_lower)
+        # Extract keywords
+        keywords = self._extract_keywords(prompt_lower)
+        # Infer potential tags
+        tags = self._infer_tags(prompt_lower, keywords)
+        return {
+            "intent": intent,
+            "keywords": keywords,
+            "tags": tags,
+            "original_query": user_prompt,
+            "is_short_query": len(prompt_lower.split()) <= 3
+        }
+    def _detect_intent(self, prompt: str) -> QueryIntent:
+        """
+        Determine if user wants tag search, keyword search, or hybrid.
+        """
+        # Check for tag indicators
+        has_tag_indicator = any(indicator in prompt for indicator in self.TAG_INDICATORS)
+        # Short queries (1-3 words) should try hybrid approach
+        word_count = len(prompt.split())
+        if has_tag_indicator:
+            return QueryIntent.TAG
+        elif word_count <= 3:
+            # Short queries: try both tag and keyword search
+            return QueryIntent.HYBRID
+        else:
+            # Longer natural language queries: keyword search first
+            return QueryIntent.KEYWORD
+    def _extract_keywords(self, prompt: str) -> List[str]:
+        """
+        Extract meaningful keywords from the prompt.
+        """
+        # Remove common stop words
+        stop_words = {
+            "show", "me", "find", "get", "search", "for", "the", "a", "an",
+            "with", "about", "of", "in", "on", "at", "to", "from", "by",
+            "what", "where", "when", "who", "how", "is", "are", "was", "were"
+        }
+        # Split and clean
+        words = re.findall(r'\b\w+\b', prompt.lower())
+        # Allow 2-character words like "F1", "AI", "3D"
+        keywords = [w for w in words if w not in stop_words and len(w) >= 2]
+        # Expand known keywords
+        expanded_keywords = []
+        for keyword in keywords:
+            if keyword in self.KEYWORD_EXPANSIONS:
+                expanded_keywords.extend(self.KEYWORD_EXPANSIONS[keyword])
+            else:
+                expanded_keywords.append(keyword)
+        # Remove duplicates while preserving order
+        return list(dict.fromkeys(expanded_keywords))
+    def _infer_tags(self, prompt: str, keywords: List[str]) -> List[str]:
+        """
+        Infer potential tag names from keywords.
+        Since we have limited tags in the database, we map common terms
+        to likely tag names.
+        """
+        # Common tag mappings based on the database
+        tag_mappings = {
+            "f1": ["f1", "racing", "motorsport", "sports"],
+            "formula": ["f1", "racing", "motorsport"],
+            "racing": ["racing", "motorsport", "f1"],
+            "dataviz": ["dataviz", "visualization"],
+            "visualization": ["dataviz", "visualization"],
+            "interactive": ["interactive"],
+            "map": ["maps", "geographic"],
+            "maps": ["maps", "geographic"],
+            "math": ["mathematics", "statistics"],
+            "statistics": ["statistics", "mathematics"],
+            "africa": ["africa", "kenya", "tanzania"],
+            "sustainability": ["sustainability", "regreening"],
+            "documentary": ["documentary", "cinematic"],
+            "education": ["students", "researchers"],
+        }
+        inferred_tags = []
+        for keyword in keywords:
+            if keyword in tag_mappings:
+                inferred_tags.extend(tag_mappings[keyword])
+        # If no specific mapping, use the keyword as-is
+        if not inferred_tags:
+            inferred_tags = keywords[:3]  # Limit to top 3 keywords
+        # Remove duplicates
+        return list(dict.fromkeys(inferred_tags))
+    def format_for_vanna(self, classification: Dict) -> str:
+        """
+        Format the classification result for Vanna's prompt.
+        Returns a string that guides Vanna's SQL generation.
+        """
+        intent = classification["intent"]
+        keywords = classification["keywords"]
+        tags = classification["tags"]
+        if intent == QueryIntent.KEYWORD:
+            return f"""
+Search using KEYWORD approach:
+- Search terms: {', '.join(keywords)}
+- Search in: posts.title, posts.author, providers.name
+- Use ILIKE with wildcards for flexible matching
+- Do not filter by tags (most posts are not tagged yet)
+"""
+        elif intent == QueryIntent.TAG:
+            return f"""
+Search using TAG approach:
+- Tag names: {', '.join(tags)}
+- Use LOWER() for case-insensitive matching
+- Join with post_tags and tags tables
+- Note: Only a few posts are tagged, results may be limited
+"""
+        else:  # HYBRID
+            return f"""
+Search using HYBRID approach:
+- Try tags first: {', '.join(tags)}
+- Fall back to keywords: {', '.join(keywords)}
+- Use OR logic: tag matches OR keyword matches in title/author
+- This maximizes results since most posts are not tagged yet
+Recommended SQL pattern:
+SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
+FROM posts p
+LEFT JOIN post_tags pt ON p.id = pt.post_id
+LEFT JOIN tags t ON pt.tag_id = t.id
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE
+    LOWER(t.name) = ANY(ARRAY[{', '.join(f"'{tag}'" for tag in tags)}])
+    OR LOWER(p.title) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
+    OR LOWER(p.author) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
+    OR LOWER(pr.name) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT 9
+"""
+# Convenience function
+def classify_query(user_prompt: str) -> Dict:
+    """
+    Run a freshly created IntentClassifier over `user_prompt`.
+    Returns the dict produced by IntentClassifier.classify().
+    """
+    return IntentClassifier().classify(user_prompt)
+# Example usage
+if __name__ == "__main__":
+    # Manual smoke run: classify a few sample prompts and print each result.
+    demo_classifier = IntentClassifier()
+    for sample in (
+        "F1",
+        "Show me F1 content",
+        "interactive visualizations",
+        "New York Times articles",
+        "content tagged with dataviz",
+        "recent sustainability projects in Africa",
+    ):
+        outcome = demo_classifier.classify(sample)
+        print(f"\nQuery: '{sample}'")
+        print(f"Intent: {outcome['intent'].value}")
+        print(f"Keywords: {outcome['keywords']}")
+        print(f"Tags: {outcome['tags']}")
+        print(f"Short query: {outcome['is_short_query']}")

src/vanna.py CHANGED Viewed

@@ -55,9 +55,6 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
             "- Never use SELECT *\n"
             "- Prefer window functions over subqueries when possible\n"
             "- Always include a LIMIT for exploratory queries\n"
-            "- Exclude posts where provider = 'SND'\n"
-            "- Exclude posts where type = 'resource'\n"
-            "- Exclude posts where type = 'insight'\n"
             "- Format dates and numbers for readability\n"
         )
@@ -106,15 +103,32 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\n## Business Logic\n"
-            "- Providers named 'SND' must always be excluded.\n"
             "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
-            "- By default, only posts with `type = 'spotlight'` are returned.\n"
-            "- Posts of type `resource` or `insight` are excluded unless explicitly requested.\n"
             "- Tags link posts to specific themes or disciplines.\n"
             "- A single post may have multiple tags, awards, or categories.\n"
             "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
             "- If the user says 'recently', filter posts from the last 90 days.\n"
             "- Always limit exploratory results to 9 rows.\n"
         )
         # ======================
@@ -145,21 +159,30 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\n## Example Interactions\n"
-            "User: 'Show me posts related to 3D'\n"
-            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
-            "JOIN post_tags pt ON p.id = pt.post_id "
-            "JOIN tags t ON pt.tag_id = t.id "
-            "JOIN providers pr ON p.provider_id = pr.id "
-            "WHERE t.name ILIKE '%3D%' AND pr.name != 'SND' AND p.type = 'spotlight' "
-            "LIMIT 9;\"]\n"
             "\nUser: 'Show me posts from The New York Times'\n"
-            "Assistant: [call run_sql with \"SELECT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
-            "LEFT JOIN providers pr ON pr.id = p.provider_id "
-            "WHERE LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%' "
-            "AND pr.name != 'SND' AND p.type = 'spotlight' "
-            "LIMIT 9;\"]\n"
         )
         # ======================
@@ -167,8 +190,6 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
         # ======================
         prompt += (
             "\nIMPORTANT:\n"
-            "- Always exclude posts with provider = 'SND'.\n"
-            "- Always exclude posts with type = 'resource' or 'insight'.\n"
             "- Always return **only the raw CSV result** — no explanations, no JSON, no commentary.\n"
             "- Stop tool execution once the query result is obtained.\n"
         )
@@ -197,8 +218,8 @@ class VannaComponent:
         db_tool = RunSqlTool(sql_runner=self.sql_runner)
         agent_memory = DemoAgentMemory(max_items=1000)
-        save_memory_tool = SaveQuestionToolArgsTool(agent_memory)
-        search_memory_tool = SearchSavedCorrectToolUsesTool(agent_memory)
         self.user_resolver = SimpleUserResolver()
@@ -211,32 +232,46 @@ class VannaComponent:
             llm_service=llm,
             tool_registry=tools,
             user_resolver=self.user_resolver,
             system_prompt_builder=CustomSQLSystemPromptBuilder("CoJournalist", self.sql_runner),
-            config=AgentConfig(stream_responses=False, max_tool_iterations=1)
         )
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
-        print(f"🙋 Prompt sent to LLM: {prompt_for_llm}")
         final_text = ""
         seen_texts = set()
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
-                print(f"💬 LLM says (part): {text[:200]}...")
                 final_text += text + "\n"
                 seen_texts.add(text)
             sql_query = getattr(component, "sql", None)
             if sql_query:
-                print(f"🧾 SQL Query Generated: {sql_query}")
             metadata = getattr(component, "metadata", None)
             if metadata:
-                print(f"📋 Metadata: {metadata}")
             component_type = getattr(component, "type", None)
             if component_type:
@@ -245,16 +280,36 @@ class VannaComponent:
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
-                folder = "513935c4d2db2d2d"
                 full_path = os.path.join(folder, filename)
                 if os.path.exists(full_path):
-                    print(f"📂 Reading result file: {full_path}")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
-                    print("🤖 Response sent to user (from file):", csv_data[:300])
                     return csv_data
                 else:
-                    print(f"⚠️ File not found: {full_path}")
         return final_text

             "- Never use SELECT *\n"
             "- Prefer window functions over subqueries when possible\n"
             "- Always include a LIMIT for exploratory queries\n"
             "- Format dates and numbers for readability\n"
         )
         # ======================
         prompt += (
             "\n## Business Logic\n"
             "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
+            "- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
             "- Tags link posts to specific themes or disciplines.\n"
             "- A single post may have multiple tags, awards, or categories.\n"
             "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
             "- If the user says 'recently', filter posts from the last 90 days.\n"
             "- Always limit exploratory results to 9 rows.\n"
+            "\n"
+            "## CRITICAL: Search Strategy\n"
+            "**IMPORTANT**: Only 3 posts currently have tags. Most posts (7,245+) are NOT tagged yet.\n"
+            "\n"
+            "**Hybrid Search Approach (RECOMMENDED)**:\n"
+            "- ALWAYS use a hybrid approach combining tag search AND keyword search with OR logic.\n"
+            "- Use LEFT JOINs for tags (not INNER JOIN) so untagged posts are included.\n"
+            "\n"
+            "**Keyword Matching - Use PostgreSQL Regex for Exact Word Boundaries**:\n"
+            "- Use ~* operator for case-insensitive regex matching\n"
+            "- Use \\m and \\M for word boundaries (start and end of word)\n"
+            "- Pattern: column ~* '\\\\mkeyword\\\\M'\n"
+            "- Example: p.title ~* '\\\\mf1\\\\M' matches 'F1' but NOT 'profile' or 'if'\n"
+            "- This ensures exact word matching, not substring matching\n"
+            "\n"
+            "**When to use tag-only search**: Only if user explicitly mentions 'tagged with' or 'tag:'.\n"
+            "**When to use keyword-only search**: For author/organization names, or when tags are not relevant.\n"
+            "\n"
+            "This ensures maximum result coverage while the database is being enriched with tags.\n"
         )
         # ======================
         # ======================
         prompt += (
             "\n## Example Interactions\n"
+            "User: 'F1' or 'Show me F1 content'\n"
+            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
+            "LEFT JOIN post_tags pt ON p.id = pt.post_id "
+            "LEFT JOIN tags t ON pt.tag_id = t.id "
+            "LEFT JOIN providers pr ON p.provider_id = pr.id "
+            "WHERE t.name ~* '\\\\mf1\\\\M' OR t.name ~* '\\\\mformula\\\\M' "
+            "OR p.title ~* '\\\\mf1\\\\M' OR p.title ~* '\\\\mformula\\\\M' "
+            "OR p.author ~* '\\\\mf1\\\\M' "
+            "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
             "\nUser: 'Show me posts from The New York Times'\n"
+            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
             "FROM posts p "
+            "LEFT JOIN providers pr ON p.provider_id = pr.id "
+            "WHERE p.author ~* '\\\\mnew\\\\M.*\\\\myork\\\\M.*\\\\mtimes\\\\M' OR pr.name ~* '\\\\mnew\\\\M.*\\\\myork\\\\M.*\\\\mtimes\\\\M' "
+            "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
+            "\nUser: 'interactive visualizations'\n"
+            "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
+            "FROM posts p "
+            "LEFT JOIN post_tags pt ON p.id = pt.post_id "
+            "LEFT JOIN tags t ON pt.tag_id = t.id "
+            "WHERE t.name ~* '\\\\minteractive\\\\M' OR p.title ~* '\\\\minteractive\\\\M' "
+            "OR p.title ~* '\\\\mvisualization\\\\M' OR t.name ~* '\\\\mdataviz\\\\M' "
+            "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
         )
         # ======================
         # ======================
         prompt += (
             "\nIMPORTANT:\n"
             "- Always return **only the raw CSV result** — no explanations, no JSON, no commentary.\n"
             "- Stop tool execution once the query result is obtained.\n"
         )
         db_tool = RunSqlTool(sql_runner=self.sql_runner)
         agent_memory = DemoAgentMemory(max_items=1000)
+        save_memory_tool = SaveQuestionToolArgsTool()
+        search_memory_tool = SearchSavedCorrectToolUsesTool()
         self.user_resolver = SimpleUserResolver()
             llm_service=llm,
             tool_registry=tools,
             user_resolver=self.user_resolver,
+            agent_memory=agent_memory,
             system_prompt_builder=CustomSQLSystemPromptBuilder("CoJournalist", self.sql_runner),
+            config=AgentConfig(stream_responses=False, max_tool_iterations=3)
         )
     async def ask(self, prompt_for_llm: str):
         ctx = RequestContext()
+        print(f"\n{'='*80}")
+        print(f"🙋 User Query: {prompt_for_llm}")
+        print(f"{'='*80}\n")
         final_text = ""
         seen_texts = set()
+        query_executed = False
+        result_row_count = 0
         async for component in self.agent.send_message(request_context=ctx, message=prompt_for_llm):
             simple = getattr(component, "simple_component", None)
             text = getattr(simple, "text", "") if simple else ""
             if text and text not in seen_texts:
+                print(f"💬 LLM Response: {text[:300]}...")
                 final_text += text + "\n"
                 seen_texts.add(text)
             sql_query = getattr(component, "sql", None)
             if sql_query:
+                query_executed = True
+                print(f"\n🧾 SQL Query Generated:")
+                print(f"{'-'*80}")
+                print(f"{sql_query}")
+                print(f"{'-'*80}\n")
             metadata = getattr(component, "metadata", None)
             if metadata:
+                print(f"📋 Query Metadata: {metadata}")
+                result_row_count = metadata.get("row_count", 0)
+                if result_row_count == 0:
+                    print(f"⚠️  Query returned 0 rows - no data matched the criteria")
+                else:
+                    print(f"✅ Query returned {result_row_count} rows")
             component_type = getattr(component, "type", None)
             if component_type:
             match = re.search(r"query_results_[\w-]+\.csv", final_text)
             if match:
                 filename = match.group(0)
+                # Calculate the user-specific folder based on the default user ID
+                import hashlib
+                user_hash = hashlib.sha256("[email protected]".encode()).hexdigest()[:16]
+                folder = user_hash
                 full_path = os.path.join(folder, filename)
+                print(f"\n📁 Looking for CSV file: {full_path}")
+                # Create folder if it doesn't exist
+                if not os.path.exists(folder):
+                    print(f"📂 Creating user directory: {folder}")
+                    os.makedirs(folder, exist_ok=True)
                 if os.path.exists(full_path):
+                    print(f"✅ Found CSV file, reading contents...")
                     with open(full_path, "r", encoding="utf-8") as f:
                         csv_data = f.read().strip()
+                    print(f"📊 CSV Data Preview: {csv_data[:200]}...")
+                    print(f"{'='*80}\n")
                     return csv_data
                 else:
+                    print(f"❌ CSV file not found at: {full_path}")
+                    # List files in the directory to help debug
+                    if os.path.exists(folder):
+                        files = os.listdir(folder)
+                        print(f"📂 Files in {folder}: {files}")
+        print(f"\n{'='*80}")
+        if not query_executed:
+            print(f"⚠️  No SQL query was executed by the LLM")
+        print(f"📤 Returning final response to user")
+        print(f"{'='*80}\n")
         return final_text

src/vanna_query_functions.py ADDED Viewed

	@@ -0,0 +1,300 @@

+"""
+Vanna Query Function Templates
+Defines SQL templates for different search strategies.
+These are used by Vanna to generate accurate, performant SQL queries.
+"""
+from typing import Dict, List
+class QueryFunctions:
+    """
+    Collection of SQL query templates for different search strategies.
+    Each public method returns one complete PostgreSQL statement as text.
+    Caller-supplied values are embedded as escaped literals (quotes doubled,
+    regex metacharacters backslash-escaped, numbers coerced with int()), so
+    input such as "O'Brien" or "C++" cannot break or alter the statement.
+    The regex literals rely on standard_conforming_strings being on (the
+    PostgreSQL default), where a backslash inside '...' is an ordinary
+    character.
+    """
+    # Columns selected by every template.
+    _POST_COLUMNS = ",\n".join(
+        f"    {column}"
+        for column in (
+            "p.id",
+            "p.title",
+            "p.source_url",
+            "p.author",
+            "p.published_date",
+            "p.image_url",
+            "p.type",
+            "pr.name as provider_name",
+        )
+    )
+    @staticmethod
+    def _quote(value: str) -> str:
+        """Return `value` as a SQL string literal with embedded quotes doubled."""
+        return "'" + str(value).replace("'", "''") + "'"
+    @staticmethod
+    def _word_regex(term: str) -> str:
+        """
+        Return a SQL literal holding a whole-word regex for `term`.
+        \\m and \\M are PostgreSQL's start/end-of-word markers; the pattern
+        is meant for the case-insensitive ~* operator.
+        """
+        # In a PostgreSQL regex a backslash followed by a non-alphanumeric
+        # character matches that character literally, so escaping every
+        # such character covers all metacharacters, not just ".", "+", "\".
+        escaped = "".join(
+            ch if ch.isalnum() or ch == "_" or ch.isspace() else "\\" + ch
+            for ch in str(term).lower()
+        )
+        return QueryFunctions._quote(f"\\m{escaped}\\M")
+    @staticmethod
+    def _keyword_predicate(keywords: List[str]) -> str:
+        """
+        OR together title/author/provider matches for every keyword.
+        Returns the constant FALSE for an empty list so the surrounding
+        statement stays valid and simply matches nothing.
+        """
+        groups = []
+        for keyword in keywords:
+            pattern = QueryFunctions._word_regex(keyword)
+            groups.append(
+                f"(p.title ~* {pattern}\n"
+                f"        OR p.author ~* {pattern}\n"
+                f"        OR pr.name ~* {pattern})"
+            )
+        return "\n    OR ".join(groups) if groups else "FALSE"
+    @staticmethod
+    def _tag_array(tags: List[str]) -> str:
+        """
+        Return the lower-cased tag names as a text[] array expression.
+        The explicit cast keeps an empty list valid (a bare ARRAY[] has no
+        determinable type in PostgreSQL).
+        """
+        items = ", ".join(QueryFunctions._quote(str(tag).lower()) for tag in tags)
+        return f"ARRAY[{items}]::text[]"
+    @staticmethod
+    def _row_limit(limit: int) -> int:
+        """Coerce `limit` to a non-negative int; raise ValueError otherwise."""
+        value = int(limit)
+        if value < 0:
+            raise ValueError(f"limit must be non-negative, got {limit!r}")
+        return value
+    @staticmethod
+    def keyword_search(keywords: List[str], limit: int = 9) -> str:
+        """
+        Full-text keyword search across title, author, and provider.
+        Works for all posts in the database (7,248 posts).
+        Each keyword is matched as a whole word, case-insensitively.
+        Args:
+            keywords: List of keywords to search for (empty list matches nothing)
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `limit` is not a non-negative integer
+        """
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS}
+FROM posts p
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE {QueryFunctions._keyword_predicate(keywords)}
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+    @staticmethod
+    def tag_search(tags: List[str], limit: int = 9) -> str:
+        """
+        Tag-based search.
+        Currently works for only 3 posts with tags.
+        As more posts are tagged, this will return more results.
+        Args:
+            tags: List of tag names to search for (empty list matches nothing)
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `limit` is not a non-negative integer
+        """
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS},
+    string_agg(DISTINCT t.name, ', ') as tags
+FROM posts p
+JOIN post_tags pt ON p.id = pt.post_id
+JOIN tags t ON pt.tag_id = t.id
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE LOWER(t.name) = ANY({QueryFunctions._tag_array(tags)})
+GROUP BY p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type, pr.name
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+    @staticmethod
+    def hybrid_search(keywords: List[str], tags: List[str], limit: int = 9) -> str:
+        """
+        Hybrid search combining tags AND keywords.
+        Best of both worlds:
+        - Finds tagged posts (currently 3)
+        - Falls back to keyword search for untagged posts (7,245)
+        The tag tables are LEFT JOINed so untagged posts stay in the result.
+        Args:
+            keywords: List of keywords to search for
+            tags: List of tag names to search for
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `limit` is not a non-negative integer
+        """
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS},
+    string_agg(DISTINCT t.name, ', ') as tags
+FROM posts p
+LEFT JOIN post_tags pt ON p.id = pt.post_id
+LEFT JOIN tags t ON pt.tag_id = t.id
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE
+    LOWER(t.name) = ANY({QueryFunctions._tag_array(tags)})
+    OR ({QueryFunctions._keyword_predicate(keywords)})
+GROUP BY p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type, pr.name
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+    @staticmethod
+    def search_by_author(author: str, limit: int = 9) -> str:
+        """
+        Search posts by specific author or organization.
+        The name is matched as a whole-word phrase against both
+        posts.author and providers.name.
+        Args:
+            author: Author name to search for
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `limit` is not a non-negative integer
+        """
+        pattern = QueryFunctions._word_regex(author)
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS}
+FROM posts p
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE
+    p.author ~* {pattern}
+    OR pr.name ~* {pattern}
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+    @staticmethod
+    def search_recent(days: int = 90, limit: int = 9) -> str:
+        """
+        Search for recent posts within the last N days.
+        Args:
+            days: Number of days to look back
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `days` is not an integer or `limit` is not a
+                non-negative integer
+        """
+        # int() guarantees only digits (and a sign) reach the INTERVAL literal.
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS}
+FROM posts p
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE
+    p.published_date >= CURRENT_DATE - INTERVAL '{int(days)} days'
+ORDER BY p.published_date DESC
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+    @staticmethod
+    def search_by_type(post_type: str, limit: int = 9) -> str:
+        """
+        Search by post type (spotlight, insight, resource).
+        Args:
+            post_type: Type of post (spotlight, insight, resource)
+            limit: Maximum number of results
+        Returns:
+            SQL query string
+        Raises:
+            ValueError: if `limit` is not a non-negative integer
+        """
+        return f"""
+SELECT DISTINCT
+{QueryFunctions._POST_COLUMNS}
+FROM posts p
+LEFT JOIN providers pr ON p.provider_id = pr.id
+WHERE p.type = {QueryFunctions._quote(post_type)}
+ORDER BY p.published_date DESC NULLS LAST
+LIMIT {QueryFunctions._row_limit(limit)};
+        """
+def generate_query(search_type: str, **kwargs) -> str:
+    """
+    Build the SQL for one of the named search strategies.
+    Args:
+        search_type: One of keyword, tag, hybrid, author, recent, type
+        **kwargs: Passed unchanged to the matching QueryFunctions method
+    Returns:
+        SQL query string
+    Raises:
+        ValueError: if `search_type` is not one of the names above
+    """
+    dispatch = {
+        "keyword": QueryFunctions.keyword_search,
+        "tag": QueryFunctions.tag_search,
+        "hybrid": QueryFunctions.hybrid_search,
+        "author": QueryFunctions.search_by_author,
+        "recent": QueryFunctions.search_recent,
+        "type": QueryFunctions.search_by_type,
+    }
+    builder = dispatch.get(search_type)
+    if builder is None:
+        raise ValueError(f"Unknown search type: {search_type}")
+    return builder(**kwargs)
+# Example usage
+if __name__ == "__main__":
+    # Test keyword search
+    print("=== KEYWORD SEARCH ===")
+    print(QueryFunctions.keyword_search(["F1", "racing"]))
+    print("\n=== TAG SEARCH ===")
+    print(QueryFunctions.tag_search(["dataviz", "interactive"]))
+    print("\n=== HYBRID SEARCH ===")
+    print(QueryFunctions.hybrid_search(
+        keywords=["visualization"],
+        tags=["dataviz", "interactive"]
+    ))
+    print("\n=== AUTHOR SEARCH ===")
+    print(QueryFunctions.search_by_author("New York Times"))
+    print("\n=== RECENT POSTS ===")
+    print(QueryFunctions.search_recent(days=30))