DurgaDeepak committed
Commit b9a4521 · verified · 1 Parent(s): 011fe90

Delete ingestion.py

Files changed (1)
  1. ingestion.py +0 -56
ingestion.py DELETED
@@ -1,56 +0,0 @@
- import os
- import glob
- import faiss
- import numpy as np
- from datasets import Dataset
- from unstructured.partition.pdf import partition_pdf
- from transformers import RagTokenizer
- from sentence_transformers import SentenceTransformer
-
- def ingest_and_push(
-     dataset_name="DurgaDeepak/meal_plans",
-     index_path="mealplan.index"
- ):
-     # 1) Tokenizer for chunking
-     rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-     # 2) Embedder for FAISS
-     embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
-     texts, sources, pages = [], [], []
-
-     # 3) Chunk each PDF
-     for pdf_path in glob.glob("meal_plans/*.pdf"):
-         book = os.path.basename(pdf_path)
-         pages_data = partition_pdf(filename=pdf_path)
-         for pg_num, page in enumerate(pages_data, start=1):
-             enc = rag_tokenizer(
-                 page.text,
-                 max_length=800,
-                 truncation=True,
-                 return_overflowing_tokens=True,
-                 stride=50,
-                 return_tensors="pt"
-             )
-             for token_ids in enc["input_ids"]:
-                 chunk = rag_tokenizer.decode(token_ids, skip_special_tokens=True)
-                 texts.append(chunk)
-                 sources.append(book)
-                 pages.append(pg_num)
-
-     # 4) Build HF Dataset
-     ds = Dataset.from_dict({
-         "text": texts,
-         "source": sources,
-         "page": pages
-     })
-     ds.push_to_hub(dataset_name, token=True)
-
-     # 5) Build FAISS index
-     embeddings = embedder.encode(texts, convert_to_numpy=True)
-     dim = embeddings.shape[1]
-     index = faiss.IndexFlatL2(dim)  # CPU index
-     index.add(embeddings)
-     faiss.write_index(index, index_path)
-
- if __name__ == "__main__":
-     ingest_and_push()
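
For context, the deleted script produced two artifacts: the DurgaDeepak/meal_plans dataset on the Hub and a local mealplan.index FAISS file. Below is a minimal sketch of how downstream code could consume those artifacts for retrieval; the search_mealplans helper and its top_k parameter are illustrative assumptions, not part of this repo, and the sketch relies on the chunks having been indexed in the same order they were pushed to the Hub.

import faiss
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Hypothetical retrieval helper over the artifacts the deleted script produced.
def search_mealplans(query, index_path="mealplan.index", top_k=3):
    # Same embedding model as the ingestion script, so query and chunk
    # vectors live in the same space.
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Assumption: FAISS row ids map 1:1 onto dataset rows because the
    # deleted script added embeddings in the same order it built the dataset.
    ds = load_dataset("DurgaDeepak/meal_plans", split="train")
    index = faiss.read_index(index_path)

    # Embed the query and search the L2 index.
    query_vec = embedder.encode([query], convert_to_numpy=True).astype(np.float32)
    distances, ids = index.search(query_vec, top_k)

    return [
        {
            "text": ds[int(i)]["text"],
            "source": ds[int(i)]["source"],
            "page": ds[int(i)]["page"],
            "distance": float(d),
        }
        for d, i in zip(distances[0], ids[0])
    ]

if __name__ == "__main__":
    for hit in search_mealplans("high protein vegetarian dinner"):
        print(hit["source"], hit["page"], hit["distance"])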