Spaces:

DurgaDeepak
/

eat2fit

Sleeping

App Files Files Community

DurgaDeepak committed on May 30

Commit

e9706de

verified ·

1 Parent(s): 9b1fba6

Update ingestion.py

Browse files

Files changed (1) hide show

ingestion.py +26 -11

ingestion.py CHANGED Viewed

@@ -1,20 +1,29 @@
 import os
 import glob
 from datasets import Dataset
 from unstructured.partition.pdf import partition_pdf
 from transformers import RagTokenizer
-def ingest_and_push(dataset_name="username/mealplan-chunks"):
-    # Initialize tokenizer for token-aware splitting
-    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
     texts, sources, pages = [], [], []
     for pdf_path in glob.glob("pdfs/*.pdf"):
         book = os.path.basename(pdf_path)
         pages_data = partition_pdf(filename=pdf_path)
-        for page_num, page in enumerate(pages_data, start=1):
-            # Encode page text into token windows
-            enc = tokenizer(
                 page.text,
                 max_length=800,
                 truncation=True,
@@ -22,14 +31,13 @@ def ingest_and_push(dataset_name="username/mealplan-chunks"):
                 stride=50,
                 return_tensors="pt"
             )
-            # Decode each token window back to text chunk
             for token_ids in enc["input_ids"]:
-                chunk = tokenizer.decode(token_ids, skip_special_tokens=True)
                 texts.append(chunk)
                 sources.append(book)
-                pages.append(page_num)
-    # Build HF Dataset
     ds = Dataset.from_dict({
         "text": texts,
         "source": sources,
@@ -37,5 +45,12 @@ def ingest_and_push(dataset_name="username/mealplan-chunks"):
     })
     ds.push_to_hub(dataset_name, token=True)
 if __name__ == "__main__":
-    ingest_and_push()

 import os
 import glob
+import faiss
+import numpy as np
 from datasets import Dataset
 from unstructured.partition.pdf import partition_pdf
 from transformers import RagTokenizer
+from sentence_transformers import SentenceTransformer
+def ingest_and_push(
+    dataset_name="username/mealplan-chunks",
+    index_path="mealplan.index"
+):
+    # 1) Tokenizer for chunking
+    rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+    # 2) Embedder for FAISS
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
     texts, sources, pages = [], [], []
+    # 3) Chunk each PDF
     for pdf_path in glob.glob("pdfs/*.pdf"):
         book = os.path.basename(pdf_path)
         pages_data = partition_pdf(filename=pdf_path)
+        for pg_num, page in enumerate(pages_data, start=1):
+            enc = rag_tokenizer(
                 page.text,
                 max_length=800,
                 truncation=True,
                 stride=50,
                 return_tensors="pt"
             )
             for token_ids in enc["input_ids"]:
+                chunk = rag_tokenizer.decode(token_ids, skip_special_tokens=True)
                 texts.append(chunk)
                 sources.append(book)
+                pages.append(pg_num)
+    # 4) Build HF Dataset
     ds = Dataset.from_dict({
         "text": texts,
         "source": sources,
     })
     ds.push_to_hub(dataset_name, token=True)
+    # 5) Build FAISS index
+    embeddings = embedder.encode(texts, convert_to_numpy=True)
+    dim        = embeddings.shape[1]
+    index      = faiss.IndexFlatL2(dim)       # CPU index
+    index.add(embeddings)
+    faiss.write_index(index, index_path)
 if __name__ == "__main__":
+    ingest_and_push()