Spaces:
Sleeping
Sleeping
| # knowledge_base.py | |
| import os | |
| import fitz # PyMuPDF | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.docstore.document import Document | |
| CHROMA_DIR = "chroma" | |
| MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" | |
def load_and_chunk_pdfs(folder_path):
    """Read every PDF in *folder_path* and split its text into overlapping chunks.

    Each PDF becomes one ``Document`` whose ``page_content`` is the text of all
    pages joined by newlines and whose metadata stores the file name under
    ``"source"``. The documents are then split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``, ``chunk_overlap=200``).

    Args:
        folder_path: Directory scanned (non-recursively) for PDF files.

    Returns:
        The list of chunk documents, or an empty list when the folder
        contains no PDF files.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)
        # The context manager closes the PDF once its text has been extracted.
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    if not documents:
        # Nothing to split, so there is no need to build a splitter.
        return []

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)
def create_vectorstore(chunks):
    """Build a Chroma vector store from *chunks* and persist it.

    The chunks are embedded with a ``HuggingFaceEmbeddings`` model named by
    ``MODEL_NAME``; the store is created with ``CHROMA_DIR`` as its persist
    directory and ``persist()`` is called on it before it is returned.

    Args:
        chunks: Documents passed to ``Chroma.from_documents``.

    Returns:
        The newly created Chroma store.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=CHROMA_DIR,
    )
    store.persist()
    return store
def load_vectorstore():
    """Return a Chroma store that uses ``CHROMA_DIR`` as its persist directory.

    The store is constructed with a ``HuggingFaceEmbeddings`` instance for
    ``MODEL_NAME`` as its embedding function.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    store = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedder)
    return store