Pulastya0 committed on
Commit
4e8b374
·
1 Parent(s): c876be4

Upload populate_chroma.py

Browse files
Files changed (1) hide show
  1. populate_chroma.py +77 -0
populate_chroma.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import chromadb
4
+ import firebase_admin
5
+ from firebase_admin import credentials, firestore
6
+ from encoder import SentenceEncoder
7
+
8
def initialize_firebase():
    """Return a Firestore client built from the FIREBASE_CREDS_JSON variable.

    The environment variable is expected to hold the service-account
    credentials as a JSON string. The Firebase app is only initialized when
    no app has been registered yet, so calling this function repeatedly is
    safe.

    Returns:
        The Firestore client on success, or ``None`` when the variable is
        missing/empty or when any step of the initialization fails (the
        reason is printed in both cases).
    """
    raw_credentials = os.getenv("FIREBASE_CREDS_JSON")
    firestore_client = None

    if raw_credentials:
        try:
            # The secret is stored as text; parse it back into a mapping.
            service_account_info = json.loads(raw_credentials)
            certificate = credentials.Certificate(service_account_info)

            # Registering a second default app would raise, so reuse it.
            if not firebase_admin._apps:
                firebase_admin.initialize_app(certificate)

            firestore_client = firestore.client()
            print("βœ… Firebase connection initialized successfully.")
        except Exception as e:
            print(f"❌ Could not initialize Firebase. Error: {e}")
            firestore_client = None
    else:
        print("❌ FIREBASE_CREDS_JSON secret not found. Cannot initialize Firebase.")

    return firestore_client
32
+
33
def _skills_list(internship):
    """Return the record's skills as a list of strings (empty when absent)."""
    skills = internship.get('skills') or []
    if isinstance(skills, str):
        # A bare string would otherwise be joined character by character.
        skills = [skills]
    return [str(skill) for skill in skills]


def _internship_text(internship):
    """Build the sentence that is embedded for one internship record."""
    title = internship.get('title', '')
    description = internship.get('description', '')
    return f"{title}. {description}. Skills: {', '.join(_skills_list(internship))}"


def _chroma_metadata(internship):
    """Return a new dict holding only values ChromaDB accepts as metadata.

    Primitive values are kept as they are, ``None`` values are dropped,
    containers are JSON-encoded and anything else is stringified. The
    skills are always stored as a JSON-encoded list under ``'skills'``.
    The input record is not modified.
    """
    metadata = {}
    for key, value in internship.items():
        if key == 'skills' or value is None:
            continue
        if isinstance(value, (str, int, float, bool)):
            metadata[key] = value
        elif isinstance(value, (list, tuple, dict)):
            metadata[key] = json.dumps(value, default=str)
        else:
            metadata[key] = str(value)
    metadata['skills'] = json.dumps(_skills_list(internship))
    return metadata


def _clear_collection(collection):
    """Delete every item currently stored in the given Chroma collection."""
    existing = collection.count()
    if existing > 0:
        print(f"ℹ️ Clearing {existing} existing items from ChromaDB.")
        # include=[] asks only for the ids instead of loading all payloads.
        collection.delete(ids=collection.get(include=[])['ids'])


def populate_vector_db():
    """
    Reads internships from Firestore, generates embeddings, and populates ChromaDB.

    The 'internships' Chroma collection under /data/chroma_db is replaced
    with one entry per Firestore document. The old content is only removed
    once the new data has been fetched and embedded, so a failure while
    reading or encoding does not leave the collection empty. Documents
    without an 'id' field fall back to their Firestore document id, and
    missing 'title', 'description' or 'skills' fields are treated as empty.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    db = initialize_firebase()
    if db is None:
        return

    # 1. Initialize other clients
    encoder = SentenceEncoder()
    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
    collection = chroma_client.get_or_create_collection(name="internships")

    # 2. Fetch all data from Firestore
    print("πŸ“š Reading internship data from Firestore...")
    internships = []
    for doc in db.collection('internships').stream():
        record = doc.to_dict() or {}
        if record.get('id') is None:
            # to_dict() only contains stored fields, not the document id.
            record['id'] = doc.id
        internships.append(record)

    if not internships:
        # Nothing to index: the collection still ends up empty, as before.
        _clear_collection(collection)
        print("❌ No internship data found in Firestore to process.")
        return

    # 3. Generate embeddings and prepare data for ChromaDB
    print(f"🧠 Generating embeddings for {len(internships)} internships...")
    texts = [_internship_text(record) for record in internships]
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
    ids = [str(record['id']) for record in internships]  # Chroma ids are strings
    metadatas = [_chroma_metadata(record) for record in internships]

    # 4. Replace the old content only now that the new data is ready
    _clear_collection(collection)

    # 5. Add data to ChromaDB
    print("βž• Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"βœ… Successfully populated ChromaDB with {collection.count()} items.")
75
+
76
# Run the population job only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == "__main__":
    populate_vector_db()