Spaces:

edouardlgp
/

Rag_with_Pleias

Runtime error

App Files Files Community

edouardlgp committed on May 10

Commit

8cfa8f7

verified ·

1 Parent(s): 88e0a23

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -39

app.py CHANGED Viewed

@@ -2,18 +2,39 @@ import gradio as gr
 import requests
 import fitz  # PyMuPDF
 import os
 from huggingface_hub import snapshot_download
 from pleias_rag_interface import RAGWithCitations
 # Pre-download the model at build time
 MODEL_REPO = "PleIAs/Pleias-RAG-350M"
 MODEL_CACHE_DIR = "./pleias_model"
 if not os.path.exists(MODEL_CACHE_DIR):
     snapshot_download(repo_id=MODEL_REPO, local_dir=MODEL_CACHE_DIR)
 # Initialize the Pleias RAG model
-rag = RAGWithCitations(model_path_or_name=MODEL_CACHE_DIR)
 # Patch tokenizer and model for generation consistency
 if hasattr(rag, "tokenizer"):
@@ -21,61 +42,116 @@ if hasattr(rag, "tokenizer"):
 if hasattr(rag, "model"):
     rag.model.config.pad_token_id = rag.tokenizer.eos_token_id
     rag.model.generation_config.do_sample = True  # Ensure consistency with top_p
 def extract_text_from_pdf_url(url):
     try:
-        response = requests.get(url, timeout=10)  # Added timeout
         response.raise_for_status()
         doc = fitz.open(stream=response.content, filetype="pdf")
         text = ""
         for page in doc:
             text += page.get_text()
         return text.strip()
     except Exception as e:
-        return f"[Error loading PDF: {str(e)}]"
 def generate_answer(query, pdf_urls_str):
-    if not query or not pdf_urls_str:  # Added input validation
-        return "Please provide both a question and at least one PDF URL"
-    pdf_urls = [url.strip() for url in pdf_urls_str.strip().split("\n") if url.strip()]
-    sources = []
-    feedback = "### PDF Load Report:\n"
-    for url in pdf_urls:
         try:
-            text = extract_text_from_pdf_url(url)
-            if not text.startswith("[Error"):
-                sources.append({
-                    "text": text,
-                    "metadata": {"source": url}
-                })
-                feedback += f"- ✅ Loaded: {url[:80]}\n"
-            else:
-                feedback += f"- ❌ Failed: {url[:80]}\n"
         except Exception as e:
-            feedback += f"- ❌ Error processing {url[:80]}: {str(e)}\n"
-    if not sources:
-        return feedback + "\n❌ No valid PDFs were successfully processed."
-    try:
-        response = rag.generate(query, sources)
-        return feedback + f"\n\n### Answer:\n{response['raw_response']}\n\n_Backend used: {response['backend_used']}_"
     except Exception as e:
-        return feedback + f"\n\n❌ Error generating answer: {str(e)}"
-iface = gr.Interface(
-    fn=generate_answer,
-    inputs=[
-        gr.Textbox(label="Your Question", placeholder="What is this document about?"),
-        gr.Textbox(lines=5, label="PDF URLs (one per line)", placeholder="https://example.com/doc1.pdf\nhttps://example.com/doc2.pdf")
-    ],
-    outputs=gr.Markdown(label="Model Response"),
-    title="Pleias RAG PDF QA",
-    description="Ask a question and get answers grounded in the content of the uploaded PDF URLs using the Pleias RAG model.",
-    allow_flagging="never"  # Disable flagging to simplify interface
-)
 if __name__ == "__main__":
-    iface.launch(server_port=7860, server_name="0.0.0.0", show_error=True)  # Added explicit server settings

 import requests
 import fitz  # PyMuPDF
 import os
+import time
+import traceback
 from huggingface_hub import snapshot_download
 from pleias_rag_interface import RAGWithCitations
+# Debugging setup
+DEBUG = True
+debug_log = []
+def log_debug(message):
+    if DEBUG:
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+        debug_log.append(f"[{timestamp}] {message}")
+        print(debug_log[-1])  # Also print to console
 # Pre-download the model at build time
 MODEL_REPO = "PleIAs/Pleias-RAG-350M"
 MODEL_CACHE_DIR = "./pleias_model"
+log_debug("Starting application initialization...")
 if not os.path.exists(MODEL_CACHE_DIR):
+    log_debug("Downloading model...")
     snapshot_download(repo_id=MODEL_REPO, local_dir=MODEL_CACHE_DIR)
 # Initialize the Pleias RAG model
+log_debug("Initializing RAG model...")
+try:
+    rag = RAGWithCitations(model_path_or_name=MODEL_CACHE_DIR)
+    log_debug("Model initialized successfully")
+except Exception as e:
+    log_debug(f"Model initialization failed: {str(e)}")
+    raise
 # Patch tokenizer and model for generation consistency
 if hasattr(rag, "tokenizer"):
 if hasattr(rag, "model"):
     rag.model.config.pad_token_id = rag.tokenizer.eos_token_id
     rag.model.generation_config.do_sample = True  # Ensure consistency with top_p
+    # Enable the key/value cache during generation (NOTE(review): use_cache does not affect the attention-mask warning — confirm intent)
+    rag.model.config.use_cache = True
 def extract_text_from_pdf_url(url):
     try:
+        log_debug(f"Fetching PDF from URL: {url}")
+        response = requests.get(url, timeout=20)
         response.raise_for_status()
         doc = fitz.open(stream=response.content, filetype="pdf")
         text = ""
         for page in doc:
             text += page.get_text()
+        log_debug(f"Successfully extracted text from PDF (length: {len(text)})")
         return text.strip()
     except Exception as e:
+        error_msg = f"[Error loading PDF: {str(e)}]"
+        log_debug(error_msg)
+        return error_msg
 def generate_answer(query, pdf_urls_str):
+    debug_output = "### Debug Console:\n"
+    try:
+        if not query or not pdf_urls_str:
+            debug_output += "❌ Error: Missing question or PDF URLs\n"
+            return debug_output + "Please provide both a question and at least one PDF URL"
+        log_debug(f"Question: {query}")
+        log_debug(f"PDF URLs: {pdf_urls_str}")
+        pdf_urls = [url.strip() for url in pdf_urls_str.strip().split("\n") if url.strip()]
+        sources = []
+        feedback = "### PDF Load Report:\n"
+        for url in pdf_urls:
+            try:
+                text = extract_text_from_pdf_url(url)
+                if not text.startswith("[Error"):
+                    sources.append({
+                        "text": text,
+                        "metadata": {"source": url}
+                    })
+                    feedback += f"- ✅ Loaded: {url[:80]}\n"
+                else:
+                    feedback += f"- ❌ Failed: {url[:80]}\n"
+            except Exception as e:
+                error_msg = f"- ❌ Error processing {url[:80]}: {str(e)}\n"
+                feedback += error_msg
+                log_debug(error_msg.strip())
+        if not sources:
+            debug_output += "❌ No valid PDFs processed\n"
+            return debug_output + feedback + "\n❌ No valid PDFs were successfully processed."
+        log_debug(f"Generating answer with {len(sources)} sources...")
         try:
+            response = rag.generate(query, sources)
+            answer = response.get('raw_response', 'No response generated')
+            backend = response.get('backend_used', 'unknown backend')
+            debug_output += f"✅ Generation completed (backend: {backend})\n"
+            return debug_output + feedback + f"\n\n### Answer:\n{answer}\n\n_Backend used: {backend}_"
         except Exception as e:
+            error_msg = f"❌ Generation failed: {str(e)}\n{traceback.format_exc()}"
+            log_debug(error_msg)
+            debug_output += error_msg
+            return debug_output + feedback + f"\n\n❌ Error generating answer: {str(e)}"
     except Exception as e:
+        error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()}"
+        log_debug(error_msg)
+        return debug_output + error_msg
+# Create a separate debug console component
+debug_console = gr.Textbox(label="Debug Console", interactive=False, visible=DEBUG)
+def get_debug_log():
+    return "\n".join(debug_log[-10:]) if debug_log else "No debug messages yet"
+with gr.Blocks() as demo:
+    gr.Markdown("## Retrieval Generation from PDF files with a pocket size 350 Model from Pleias")
+    gr.Markdown("Ask a question and get answers grounded in the content of the uploaded PDF URLs")
+    with gr.Row():
+        with gr.Column():
+            question = gr.Textbox(label="Your Question", placeholder="What is this document about?")
+            pdf_urls = gr.Textbox(lines=5, label="PDF URLs (one per line)",
+                                placeholder="https://example.com/doc1.pdf\nhttps://example.com/doc2.pdf")
+            submit_btn = gr.Button("Submit", variant="primary")
+        with gr.Column():
+            output = gr.Markdown(label="Model Response")
+            if DEBUG:
+                debug_console.render()
+                gr.Button("Refresh Debug").click(
+                    fn=get_debug_log,
+                    outputs=debug_console
+                )
+    submit_btn.click(
+        fn=generate_answer,
+        inputs=[question, pdf_urls],
+        outputs=output
+    )
 if __name__ == "__main__":
+    log_debug("Launching interface...")
+    demo.launch(
+        server_port=7860,
+        server_name="0.0.0.0",
+        show_error=True,
+        debug=DEBUG,
+        ssl_verify=False  # NOTE(review): this skips validation of the app's own server certificate; it does not affect requests.get() on PDF URLs — confirm intent
+    )