"""Project-wide configuration: filesystem paths and pipeline constants.

Importing this module has a side effect: the data directories listed
below are created on disk if they do not already exist.
"""

import os
from pathlib import Path

# --- Paths -----------------------------------------------------------------

# Directory one level above the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parent.parent

# Root of all data files. The ``DATA_DIR`` environment variable overrides
# the default location of ``BASE_DIR / "datasets"``.
DATA_DIR = Path(os.environ.get("DATA_DIR", BASE_DIR / "datasets"))

RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"

# Ensure directories exist. ``parents=True`` creates any missing ancestors
# and ``exist_ok=True`` makes repeated imports harmless.
for path in [DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# --- Preprocessing configs -------------------------------------------------

# NOTE(review): presumably the earliest year kept during preprocessing --
# confirm against the code that reads this value.
MIN_YEAR = 2020

# NOTE(review): presumably an upper bound on the number of texts processed --
# confirm against the code that reads this value.
MAX_TEXTS = 2000

# --- Embeddings configuration ----------------------------------------------

# Name of the embedding model to use.
# Alternative kept for reference: "allenai/specter2"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"