edouardlgp committed on
Commit
8cfa8f7
·
verified ·
1 Parent(s): 88e0a23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -39
app.py CHANGED
@@ -2,18 +2,39 @@ import gradio as gr
2
  import requests
3
  import fitz # PyMuPDF
4
  import os
 
 
5
  from huggingface_hub import snapshot_download
6
  from pleias_rag_interface import RAGWithCitations
7
 
 
 
 
 
 
 
 
 
 
 
8
  # Pre-download the model at build time
9
  MODEL_REPO = "PleIAs/Pleias-RAG-350M"
10
  MODEL_CACHE_DIR = "./pleias_model"
11
 
 
 
12
  if not os.path.exists(MODEL_CACHE_DIR):
 
13
  snapshot_download(repo_id=MODEL_REPO, local_dir=MODEL_CACHE_DIR)
14
 
15
  # Initialize the Pleias RAG model
16
- rag = RAGWithCitations(model_path_or_name=MODEL_CACHE_DIR)
 
 
 
 
 
 
17
 
18
  # Patch tokenizer and model for generation consistency
19
  if hasattr(rag, "tokenizer"):
@@ -21,61 +42,116 @@ if hasattr(rag, "tokenizer"):
21
  if hasattr(rag, "model"):
22
  rag.model.config.pad_token_id = rag.tokenizer.eos_token_id
23
  rag.model.generation_config.do_sample = True # Ensure consistency with top_p
 
 
24
 
25
  def extract_text_from_pdf_url(url):
26
  try:
27
- response = requests.get(url, timeout=10) # Added timeout
 
28
  response.raise_for_status()
29
  doc = fitz.open(stream=response.content, filetype="pdf")
30
  text = ""
31
  for page in doc:
32
  text += page.get_text()
 
33
  return text.strip()
34
  except Exception as e:
35
- return f"[Error loading PDF: {str(e)}]"
 
 
36
 
37
  def generate_answer(query, pdf_urls_str):
38
- if not query or not pdf_urls_str: # Added input validation
39
- return "Please provide both a question and at least one PDF URL"
 
 
 
40
 
41
- pdf_urls = [url.strip() for url in pdf_urls_str.strip().split("\n") if url.strip()]
42
- sources = []
43
- feedback = "### PDF Load Report:\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- for url in pdf_urls:
46
  try:
47
- text = extract_text_from_pdf_url(url)
48
- if not text.startswith("[Error"):
49
- sources.append({
50
- "text": text,
51
- "metadata": {"source": url}
52
- })
53
- feedback += f"- βœ… Loaded: {url[:80]}\n"
54
- else:
55
- feedback += f"- ❌ Failed: {url[:80]}\n"
56
  except Exception as e:
57
- feedback += f"- ❌ Error processing {url[:80]}: {str(e)}\n"
 
 
 
58
 
59
- if not sources:
60
- return feedback + "\n❌ No valid PDFs were successfully processed."
61
-
62
- try:
63
- response = rag.generate(query, sources)
64
- return feedback + f"\n\n### Answer:\n{response['raw_response']}\n\n_Backend used: {response['backend_used']}_"
65
  except Exception as e:
66
- return feedback + f"\n\n❌ Error generating answer: {str(e)}"
67
-
68
- iface = gr.Interface(
69
- fn=generate_answer,
70
- inputs=[
71
- gr.Textbox(label="Your Question", placeholder="What is this document about?"),
72
- gr.Textbox(lines=5, label="PDF URLs (one per line)", placeholder="https://example.com/doc1.pdf\nhttps://example.com/doc2.pdf")
73
- ],
74
- outputs=gr.Markdown(label="Model Response"),
75
- title="Pleias RAG PDF QA",
76
- description="Ask a question and get answers grounded in the content of the uploaded PDF URLs using the Pleias RAG model.",
77
- allow_flagging="never" # Disable flagging to simplify interface
78
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if __name__ == "__main__":
81
- iface.launch(server_port=7860, server_name="0.0.0.0", show_error=True) # Added explicit server settings
 
 
 
 
 
 
 
 
2
  import requests
3
  import fitz # PyMuPDF
4
  import os
5
+ import time
6
+ import traceback
7
  from huggingface_hub import snapshot_download
8
  from pleias_rag_interface import RAGWithCitations
9
 
10
# Debugging setup
DEBUG = True
# Upper bound on retained log lines so a long-running server does not
# accumulate debug messages without limit.
MAX_DEBUG_ENTRIES = 500
debug_log = []


def log_debug(message):
    """Record a timestamped debug message and echo it to stdout.

    Does nothing when ``DEBUG`` is false. The entry is appended to the
    module-level ``debug_log`` list, which is trimmed to the most recent
    ``MAX_DEBUG_ENTRIES`` lines.

    Args:
        message: Text to log; it is interpolated into an f-string, so any
            object with a string representation is accepted.
    """
    if not DEBUG:
        return
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{timestamp}] {message}"
    debug_log.append(entry)
    # Trim in place rather than rebinding so every existing reference to
    # ``debug_log`` keeps seeing the same list object.
    overflow = len(debug_log) - MAX_DEBUG_ENTRIES
    if overflow > 0:
        del debug_log[:overflow]
    print(entry)  # Also print to console
19
+
20
# Pre-download the model at build time
MODEL_REPO = "PleIAs/Pleias-RAG-350M"
MODEL_CACHE_DIR = "./pleias_model"

log_debug("Starting application initialization...")

# Fetch the snapshot only when the local model directory is absent.
if not os.path.exists(MODEL_CACHE_DIR):
    log_debug("Downloading model...")
    snapshot_download(repo_id=MODEL_REPO, local_dir=MODEL_CACHE_DIR)

# Initialize the Pleias RAG model
log_debug("Initializing RAG model...")
try:
    rag = RAGWithCitations(model_path_or_name=MODEL_CACHE_DIR)
    log_debug("Model initialized successfully")
except Exception as init_error:
    # Record the failure in the debug log, then let start-up abort.
    log_debug(f"Model initialization failed: {init_error!s}")
    raise
38
 
39
  # Patch tokenizer and model for generation consistency
40
  if hasattr(rag, "tokenizer"):
 
42
  if hasattr(rag, "model"):
43
  rag.model.config.pad_token_id = rag.tokenizer.eos_token_id
44
  rag.model.generation_config.do_sample = True # Ensure consistency with top_p
45
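+ # Intended to silence the attention-mask warning. NOTE(review): use_cache only toggles key/value caching during generation; confirm it actually affects that warning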
46
+ rag.model.config.use_cache = True
47
 
48
def extract_text_from_pdf_url(url):
    """Download a PDF and return the concatenated text of its pages.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        The page text with leading/trailing whitespace stripped. If the
        download or parsing fails, a string of the form
        ``"[Error loading PDF: <reason>]"`` is returned instead; no
        exception propagates to the caller.
    """
    try:
        log_debug(f"Fetching PDF from URL: {url}")
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        # The context manager closes the document even if a page fails to
        # parse, so the underlying buffer is not leaked.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        log_debug(f"Successfully extracted text from PDF (length: {len(text)})")
        return text.strip()
    except Exception as e:
        # Deliberate catch-all: failures are reported as a marker string.
        error_msg = f"[Error loading PDF: {e}]"
        log_debug(error_msg)
        return error_msg
63
 
64
def _load_pdf_sources(pdf_urls):
    """Fetch every PDF URL and build the source list plus a Markdown load report.

    Returns:
        A ``(sources, feedback)`` tuple. ``sources`` holds one
        ``{"text": ..., "metadata": {"source": url}}`` dict per PDF that
        yielded text; ``feedback`` is a Markdown bullet list with one line
        per URL (URLs are truncated to 80 characters for display).
    """
    sources = []
    feedback = "### PDF Load Report:\n"
    for url in pdf_urls:
        try:
            text = extract_text_from_pdf_url(url)
        except Exception as e:
            error_msg = f"- ❌ Error processing {url[:80]}: {e}\n"
            feedback += error_msg
            log_debug(error_msg.strip())
            continue

        if text.startswith("[Error"):
            # The extractor signals failure with an "[Error ..." marker string.
            feedback += f"- ❌ Failed: {url[:80]}\n"
        elif not text:
            # A PDF with no extractable text would only add an empty source.
            feedback += f"- ❌ No extractable text: {url[:80]}\n"
        else:
            sources.append({"text": text, "metadata": {"source": url}})
            feedback += f"- ✅ Loaded: {url[:80]}\n"
    return sources, feedback


def generate_answer(query, pdf_urls_str):
    """Answer a question using the text of the PDFs at the given URLs.

    Args:
        query: The user's question.
        pdf_urls_str: PDF URLs separated by newlines; blank lines are ignored.

    Returns:
        A Markdown string that starts with a "Debug Console" section,
        followed by the PDF load report and either the generated answer or
        an error description. Exceptions are caught and reported in the
        returned text rather than raised.
    """
    debug_output = "### Debug Console:\n"
    try:
        # Treat whitespace-only input the same as missing input.
        has_query = bool(query and query.strip())
        has_urls = bool(pdf_urls_str and pdf_urls_str.strip())
        if not (has_query and has_urls):
            debug_output += "❌ Error: Missing question or PDF URLs\n"
            return debug_output + "Please provide both a question and at least one PDF URL"

        log_debug(f"Question: {query}")
        log_debug(f"PDF URLs: {pdf_urls_str}")

        pdf_urls = [url.strip() for url in pdf_urls_str.strip().split("\n") if url.strip()]
        sources, feedback = _load_pdf_sources(pdf_urls)

        if not sources:
            debug_output += "❌ No valid PDFs processed\n"
            return debug_output + feedback + "\n❌ No valid PDFs were successfully processed."

        log_debug(f"Generating answer with {len(sources)} sources...")
        try:
            response = rag.generate(query, sources)
            answer = response.get('raw_response', 'No response generated')
            backend = response.get('backend_used', 'unknown backend')

            debug_output += f"✅ Generation completed (backend: {backend})\n"
            return debug_output + feedback + f"\n\n### Answer:\n{answer}\n\n_Backend used: {backend}_"
        except Exception as e:
            error_msg = f"❌ Generation failed: {e}\n{traceback.format_exc()}"
            log_debug(error_msg)
            debug_output += error_msg
            return debug_output + feedback + f"\n\n❌ Error generating answer: {e}"

    except Exception as e:
        # Last-resort boundary so the UI always receives a string.
        error_msg = f"❌ Unexpected error: {e}\n{traceback.format_exc()}"
        log_debug(error_msg)
        return debug_output + error_msg
116
+
117
# Create a separate debug console component.
# It is instantiated here and placed into the layout later through
# debug_console.render().
debug_console = gr.Textbox(label="Debug Console", interactive=False, visible=DEBUG)


def get_debug_log():
    """Return the ten most recent debug entries, one per line.

    Returns:
        The newline-joined tail of ``debug_log``, or a placeholder message
        when nothing has been logged yet.
    """
    if not debug_log:
        return "No debug messages yet"
    recent_entries = debug_log[-10:]
    return "\n".join(recent_entries)
122
+
123
# Two-column layout: question and URL inputs in the first column, the model
# response (and, when DEBUG is on, the debug console) in the second.
with gr.Blocks() as demo:
    gr.Markdown("## Retrieval Generation from PDF files with a pocket size 350 Model from Pleias")
    gr.Markdown("Ask a question and get answers grounded in the content of the uploaded PDF URLs")

    with gr.Row():
        with gr.Column():
            question = gr.Textbox(
                label="Your Question",
                placeholder="What is this document about?",
            )
            pdf_urls = gr.Textbox(
                lines=5,
                label="PDF URLs (one per line)",
                placeholder="https://example.com/doc1.pdf\nhttps://example.com/doc2.pdf",
            )
            submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column():
            output = gr.Markdown(label="Model Response")
            if DEBUG:
                # Place the pre-built console here and add a manual refresh.
                debug_console.render()
                gr.Button("Refresh Debug").click(fn=get_debug_log, outputs=debug_console)

    # Run the question-answering pipeline when Submit is pressed.
    submit_btn.click(fn=generate_answer, inputs=[question, pdf_urls], outputs=output)
148
 
149
if __name__ == "__main__":
    log_debug("Launching interface...")
    # ssl_verify is intentionally not passed: in Gradio that option concerns
    # validation of the app's own TLS certificate and has no effect on the
    # outbound PDF downloads, which go through requests.get() elsewhere.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",  # listen on all interfaces
        show_error=True,
        debug=DEBUG,
    )