# Spaces: Sleeping
# File size: 2,472 bytes
# Commits: 4e8b374, 6806256
import json
import chromadb
import firebase_admin
from firebase_admin import credentials, firestore
from encoder import SentenceEncoder
def initialize_firebase_with_file(key_path="serviceAccountKey.json"):
    """Initialize Firebase from a local service-account key file.

    The default Firebase app is created only when it does not exist yet, so
    calling this function more than once in the same process is safe.

    Args:
        key_path: Path to the service-account JSON key. Defaults to
            ``"serviceAccountKey.json"`` (relative to the working directory).

    Returns:
        The object returned by ``firestore.client()``, or ``None`` when any
        step of the initialization fails. Failures are printed, not raised.
    """
    try:
        cred = credentials.Certificate(key_path)
        try:
            # Public check for "is the default app initialized?":
            # get_app() raises ValueError when it is not.
            firebase_admin.get_app()
        except ValueError:
            firebase_admin.initialize_app(cred)
        db = firestore.client()
        print("✅ Firebase connection initialized from file.")
        return db
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary for
        # Firebase setup, and callers only test the result against None.
        print(f"❌ Could not initialize Firebase from file. Error: {e}")
        print(f" - Make sure '{key_path}' has been uploaded to the terminal.")
        return None
def _load_internships(db):
    """Return every document of the Firestore 'internships' collection as a dict."""
    records = []
    for doc in db.collection('internships').stream():
        record = doc.to_dict() or {}
        # to_dict() contains only stored fields; fall back to the document ID
        # when the record does not carry its own 'id' field.
        record.setdefault('id', doc.id)
        records.append(record)
    return records


def _internship_text(record):
    """Build the text that gets embedded for one internship record."""
    skills = record.get('skills') or []
    if isinstance(skills, str):
        # Joining a bare string would split it into single characters.
        skills = [skills]
    skill_list = ', '.join(str(skill) for skill in skills)
    title = record.get('title', '')
    description = record.get('description', '')
    return f"{title}. {description}. Skills: {skill_list}"


def _internship_metadata(record):
    """Return a copy of the record with 'skills' serialized to a JSON string."""
    skills = record.get('skills')
    metadata = dict(record)  # copy so the caller's record is not mutated
    metadata['skills'] = json.dumps([] if skills is None else skills)
    return metadata


def populate_vector_db():
    """
    Reads internships from Firestore, generates embeddings, and populates ChromaDB.

    The existing contents of the 'internships' ChromaDB collection are removed
    only after the replacement data has been fetched and embedded, so a run
    that finds no data (or fails while embedding) leaves the old index intact.

    Returns None. Progress and failures are reported with print().
    """
    db = initialize_firebase_with_file()
    if db is None:
        return

    # 1. Fetch data from Firestore
    print("📚 Reading internship data from Firestore...")
    internships = _load_internships(db)
    if not internships:
        print("❌ No internship data found in Firestore.")
        return

    # 2. Generate embeddings
    print(f"🧠 Generating embeddings for {len(internships)} internships...")
    encoder = SentenceEncoder()
    texts = [_internship_text(record) for record in internships]
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
    ids = [str(record['id']) for record in internships]
    metadatas = [_internship_metadata(record) for record in internships]

    # 3. Open the vector store and clear existing data
    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
    collection = chroma_client.get_or_create_collection(name="internships")
    existing = collection.count()
    if existing > 0:
        print(f"ℹ️ Clearing {existing} existing items from ChromaDB.")
        # include=[] skips documents/metadatas; the ids are still returned.
        collection.delete(ids=collection.get(include=[])['ids'])

    # 4. Add to ChromaDB
    print("➕ Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")
# Run the population job only when executed as a script, not on import.
if __name__ == "__main__":
    populate_vector_db()