Spaces:
Sleeping
Sleeping
| import os | |
| import fitz # PyMuPDF | |
| import requests | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.docstore.document import Document | |
# Directory passed to Chroma as its persist directory by the functions in this
# file; resolved against the current working directory when the module loads.
CHROMA_DIR = os.path.abspath("chroma")
print("π Loading vectorstore from:", CHROMA_DIR)

# Model name passed to HuggingFaceEmbeddings when building/loading the store.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Set this to your actual file on HF
# (adjacent literals are concatenated into one URL; split only for readability)
HF_FILE_URL = (
    "https://huggingface.co/spaces/DurgaDeepak/eat2fit/resolve/main/meal_plans/"
    "Lafayette%2C%20Natasha%20-%20Fit%20By%20Tasha%20High%20Protein%20Recipes"
    "%20_%2052%20High%20Protein%20Clean%20Recipes%20%26%20Meal%20Plan%20(2021).pdf"
)
def ensure_pdf_downloaded(local_path: str, url: str) -> None:
    """Download ``url`` to ``local_path`` unless a file already exists there.

    The body is streamed to a temporary ``<local_path>.part`` file and moved
    into place only after the transfer completes, so an interrupted download
    never leaves a truncated file that later calls would mistake for a
    finished one.

    Args:
        local_path: Destination path for the PDF.
        url: HTTP(S) URL to fetch the PDF from.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        requests.RequestException: On connection errors or timeouts.
    """
    if os.path.exists(local_path):
        return

    print(f"Downloading large PDF from: {url}")
    partial_path = f"{local_path}.part"

    # stream=True avoids holding the whole (large) PDF in memory; the timeout
    # is (connect, read) seconds so a stalled server cannot hang the caller.
    with requests.get(url, stream=True, timeout=(10, 300)) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download PDF: {response.status_code}")
        try:
            with open(partial_path, "wb") as f:
                for block in response.iter_content(chunk_size=1024 * 1024):
                    f.write(block)
            # Atomic rename: local_path only ever holds a complete file.
            os.replace(partial_path, local_path)
        except Exception:
            # Do not leave a half-written temp file behind on failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    print("PDF downloaded successfully.")
def load_and_chunk_pdfs(folder_path):
    """Extract text from every ``.pdf`` in ``folder_path`` and split it into chunks.

    Files smaller than 1000 bytes are treated as Git LFS pointer stubs and are
    replaced by the PDF at ``HF_FILE_URL`` before being parsed.

    Args:
        folder_path: Directory whose ``.pdf`` files are read.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` over
        one ``Document`` per PDF (``chunk_size=1000``, ``chunk_overlap=200``),
        each carrying ``{"source": <filename>}`` metadata.
    """
    documents = []
    for filename in os.listdir(folder_path):
        if not filename.endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)

        # Try downloading the file if it's an LFS pointer
        if os.path.getsize(path) < 1000:  # LFS pointer files are tiny
            # ensure_pdf_downloaded() skips paths that already exist, so calling
            # it on `path` would leave the pointer stub in place. Download next
            # to it instead and swap the real file in once it is complete.
            fetched_path = f"{path}.download"
            if os.path.exists(fetched_path):
                os.remove(fetched_path)  # discard leftovers from an earlier run
            ensure_pdf_downloaded(fetched_path, HF_FILE_URL)
            os.replace(fetched_path, path)

        # Close the PDF handle deterministically and extract each page once.
        with fitz.open(path) as doc:
            page_texts = [page.get_text() for page in doc]
        text = "\n".join(page_text for page_text in page_texts if page_text)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
def create_vectorstore(chunks):
    """Build a Chroma store from ``chunks`` using the ``MODEL_NAME`` embeddings.

    ``CHROMA_DIR`` is passed as the store's ``persist_directory``.

    Args:
        chunks: Documents to embed, e.g. the output of ``load_and_chunk_pdfs``.

    Returns:
        The ``Chroma`` instance returned by ``Chroma.from_documents``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    return Chroma.from_documents(
        chunks,
        embedding_model,
        persist_directory=CHROMA_DIR,
    )
def load_vectorstore():
    """Open the Chroma store at ``CHROMA_DIR`` and print a short summary of it.

    The summary is best-effort diagnostics only: any exception raised while
    reading the store is printed, not propagated, and the store object is
    returned either way.

    Returns:
        The ``Chroma`` instance bound to ``CHROMA_DIR`` with ``MODEL_NAME``
        embeddings as its embedding function.
    """
    print("π Loading from:", CHROMA_DIR)
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

    # Debug block
    try:
        stored_texts = db.get()["documents"]
        print(f"β Loaded vectorstore with {len(stored_texts)} docs")
        # An empty store is not an error: skip the snippet instead of letting
        # stored_texts[0] raise IndexError and be reported as a load failure.
        if stored_texts:
            print(f"π§Ύ First doc snippet: {stored_texts[0][:100]}...")
    except Exception as e:
        print(f"β Vectorstore load error: {e}")
    return db