Spaces:
Sleeping
Sleeping
| import json | |
| import chromadb | |
| import firebase_admin | |
| from firebase_admin import credentials, firestore | |
| from encoder import SentenceEncoder | |
def initialize_firebase_with_file():
    """Connect to Firestore using the local ``serviceAccountKey.json`` file.

    Returns:
        The Firestore client on success, or ``None`` when any step of the
        initialization raised. Failures are reported on stdout rather than
        propagated to the caller.
    """
    firestore_client = None
    try:
        # Credentials are read from the key file in the working directory.
        service_account = credentials.Certificate("serviceAccountKey.json")
        # Skip initialize_app when firebase_admin already has an app registered.
        app_already_registered = bool(firebase_admin._apps)
        if not app_already_registered:
            firebase_admin.initialize_app(service_account)
        firestore_client = firestore.client()
        print("β Firebase connection initialized from file.")
    except Exception as error:
        print(f"β Could not initialize Firebase from file. Error: {error}")
        print(" - Make sure 'serviceAccountKey.json' has been uploaded to the terminal.")
        return None
    return firestore_client
def populate_vector_db():
    """Rebuild the ``internships`` ChromaDB collection from Firestore.

    Reads every document of the Firestore ``internships`` collection, embeds
    a text summary of each one (title, description and skills) and stores the
    embeddings together with the document fields as metadata in the
    persistent ChromaDB located at ``/data/chroma_db``.

    The previous contents of the collection are removed only after the
    replacement data has been fetched and embedded, so an empty or failed
    Firestore read does not leave the vector store empty.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    db = initialize_firebase_with_file()
    if db is None:
        return

    # 1. Initialize other clients
    encoder = SentenceEncoder()
    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
    collection = chroma_client.get_or_create_collection(name="internships")

    # 2. Fetch data from Firestore
    print("π Reading internship data from Firestore...")
    ids = []
    texts = []
    metadatas = []
    for doc in db.collection('internships').stream():
        record = doc.to_dict() or {}

        # to_dict() does not carry the document ID, so fall back to doc.id
        # when no explicit 'id' field is stored. ChromaDB only accepts
        # string IDs, hence the str() coercion.
        stored_id = record.get('id')
        ids.append(str(doc.id if stored_id is None else stored_id))

        # Tolerate documents with missing fields instead of aborting the
        # whole import with a KeyError.
        skills = record.get('skills') or []
        texts.append(
            f"{record.get('title', '')}. {record.get('description', '')}. "
            f"Skills: {', '.join(map(str, skills))}"
        )

        # The skills list is stored JSON-encoded in the metadata. Build a
        # copy so the Firestore dict itself is left unmodified.
        metadatas.append({**record, 'skills': json.dumps(skills)})

    if not ids:
        print("β No internship data found in Firestore.")
        return

    # 3. Generate embeddings
    print(f"π§ Generating embeddings for {len(ids)} internships...")
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()

    # 4. Clear existing data (only now that the replacement is ready)
    existing_count = collection.count()
    if existing_count > 0:
        print(f"βΉοΈ Clearing {existing_count} existing items from ChromaDB.")
        # include=[] asks for the IDs only, not documents and metadata.
        collection.delete(ids=collection.get(include=[])['ids'])

    # 5. Add to ChromaDB
    print("β Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"β Successfully populated ChromaDB with {collection.count()} items.")
# Script entry point: rebuild the vector store only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    populate_vector_db()