Spaces:

Pulastya0
/

SIH-ML-Backend

Sleeping

App Files Files Community

Pulastya0 committed on Sep 12

Commit

4e8b374

1 Parent(s): c876be4

Upload populate_chroma.py

Browse files

Files changed (1) hide show

populate_chroma.py +77 -0

populate_chroma.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import json
+import chromadb
+import firebase_admin
+from firebase_admin import credentials, firestore
+from encoder import SentenceEncoder
+def initialize_firebase():
+    """Initializes the Firebase connection using Hugging Face secrets."""
+    # Get the JSON credentials from the environment variable
+    creds_json_string = os.getenv("FIREBASE_CREDS_JSON")
+    if not creds_json_string:
+        print("❌ FIREBASE_CREDS_JSON secret not found. Cannot initialize Firebase.")
+        return None
+    try:
+        # Convert the JSON string back into a dictionary
+        creds_dict = json.loads(creds_json_string)
+        cred = credentials.Certificate(creds_dict)
+        # Initialize the app (check if it's already initialized)
+        if not firebase_admin._apps:
+            firebase_admin.initialize_app(cred)
+        db = firestore.client()
+        print("✅ Firebase connection initialized successfully.")
+        return db
+    except Exception as e:
+        print(f"❌ Could not initialize Firebase. Error: {e}")
+        return None
+def populate_vector_db():
+    """
+    Reads internships from Firestore, generates embeddings, and populates ChromaDB.
+    """
+    db = initialize_firebase()
+    if db is None:
+        return
+    # 1. Initialize other clients
+    encoder = SentenceEncoder()
+    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
+    collection = chroma_client.get_or_create_collection(name="internships")
+    # 2. Clear existing data in ChromaDB
+    if collection.count() > 0:
+        print(f"ℹ️ Clearing {collection.count()} existing items from ChromaDB.")
+        collection.delete(ids=collection.get()['ids'])
+    # 3. Fetch all data from Firestore
+    print("📚 Reading internship data from Firestore...")
+    internships_ref = db.collection('internships').stream()
+    internships = [doc.to_dict() for doc in internships_ref]
+    if not internships:
+        print("❌ No internship data found in Firestore to process.")
+        return
+    # 4. Generate embeddings and prepare data for ChromaDB
+    print(f"🧠 Generating embeddings for {len(internships)} internships...")
+    texts = [f"{i['title']}. {i['description']}. Skills: {', '.join(i['skills'])}" for i in internships]
+    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
+    ids = [i['id'] for i in internships]
+    metadatas = []
+    for i in internships:
+        i['skills'] = json.dumps(i['skills'])
+        metadatas.append(i)
+    # 5. Add data to ChromaDB
+    print("➕ Adding data to ChromaDB...")
+    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
+    print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")
+# Run the rebuild only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    populate_vector_db()