Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS | |
| from utils.console_manager import console_manager | |
# Paths
# Raw input CSV, read by load_filtered().
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
# Processed outputs: preprocess_and_save() writes the same DataFrame in both
# formats and treats the dataset as already built only when both files exist.
# NOTE(review): the "2020plus" suffix presumably reflects MIN_YEAR == 2020 —
# confirm against config, since the filter itself uses MIN_YEAR.
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
def load_filtered():
    """Read the raw CSV and keep only rows updated in ``MIN_YEAR`` or later.

    Only the ``id``, ``title``, ``abstract``, ``categories`` and
    ``update_date`` columns are read from ``RAW_CSV``; ``id`` is kept as a
    string. ``update_date`` is parsed to datetimes, with unparseable values
    coerced to ``NaT``.

    Returns:
        A DataFrame with columns ``id``, ``title``, ``abstract``,
        ``categories`` and ``year`` (the year of ``update_date``), with a
        fresh 0..n-1 index.
    """
    wanted_columns = ["id", "title", "abstract", "categories", "update_date"]
    frame = pd.read_csv(
        RAW_CSV,
        usecols=wanted_columns,
        dtype={"id": str},
        low_memory=False,
    )

    frame["update_date"] = pd.to_datetime(frame["update_date"], errors="coerce")

    # Rows whose date failed to parse have a NaN year, which never satisfies
    # the comparison, so they are filtered out together with the older rows.
    is_recent = frame["update_date"].dt.year >= MIN_YEAR
    recent = frame[is_recent].copy()
    recent["year"] = recent["update_date"].dt.year

    output_columns = ["id", "title", "abstract", "categories", "year"]
    return recent[output_columns].reset_index(drop=True)
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Line breaks become spaces, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is removed.

    Returns:
        The normalized string, or ``""`` when *text* is not a ``str``
        (for example ``None`` or a float NaN coming from pandas).
    """
    if isinstance(text, str):
        for line_break in ("\n", "\r"):
            text = text.replace(line_break, " ")
        return re.sub(r"\s+", " ", text).strip()
    return ""
def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Apply ``clean_text`` to every value of the given text columns.

    Args:
        df: Frame to clean. It is modified in place.
        text_columns: Names of the columns to clean; defaults to
            ``["title", "abstract"]`` when ``None``.

    Returns:
        The same ``df`` object, with the selected columns overwritten by
        their cleaned values.
    """
    targets = ["title", "abstract"] if text_columns is None else text_columns
    for column_name in targets:
        cleaned = df[column_name].apply(clean_text)
        df[column_name] = cleaned
    return df
def preprocess_and_save():
    """Build the processed dataset, or reuse it if it is already on disk.

    When both ``PROCESSED_PARQUET`` and ``PROCESSED_CSV`` exist, the Parquet
    file is loaded and returned without reprocessing. Otherwise the raw data
    is loaded with ``load_filtered``, cleaned with ``clean_dataframe``,
    truncated to the first ``MAX_TEXTS`` rows when ``MAX_TEXTS`` is not
    ``None``, and written to both output files.

    Returns:
        The processed DataFrame, or ``None`` if any step raised. Failures are
        reported through ``console_manager.print_error`` instead of being
        propagated to the caller.
    """
    try:
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        with console_manager.status("Processing file..."):
            df = load_filtered()
            df = clean_dataframe(df)

            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS)
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            # Make sure the output directory exists so a first run does not
            # fail on the write below.
            # NOTE(review): assumes PROCESSED_DIR is a pathlib.Path (it is
            # joined with "/" to build the output paths) — confirm in config.
            PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)

        # Printed after the status context has closed, and only when both
        # writes succeeded.
        console_manager.print_success(
            f"Pré-processing complete. Files saved in: {PROCESSED_DIR}"
        )
    except Exception as e:
        # Deliberate catch-all at the pipeline boundary: report the failure
        # and signal it with None rather than crashing the caller.
        console_manager.print_error(f"Pré-processing failed: {e}")
        return None
    return df