csds-project / preprocess.py
beatrizpm's picture
Upload 9 files
532f1f0 verified
import pandas as pd
import re
from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
from utils.console_manager import console_manager
# Paths
# Raw input CSV, read by load_filtered().
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
# Processed outputs: preprocess_and_save() writes the same table in both
# formats and reloads the parquet copy when both files already exist.
# NOTE: the "2020plus" suffix is fixed in the file names; the actual year
# cutoff applied to the data is MIN_YEAR from config.
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
def load_filtered():
    """Read the raw CSV and keep only rows updated in MIN_YEAR or later.

    Only the id, title, abstract, categories and update_date columns are
    read, with ``id`` forced to ``str``. ``update_date`` is parsed with
    ``errors="coerce"``, so values that cannot be parsed become NaT and
    are dropped by the year comparison.

    Returns:
        A DataFrame with columns id, title, abstract, categories and
        year (taken from ``update_date``), with a fresh 0-based index.
    """
    wanted = ["id", "title", "abstract", "categories", "update_date"]
    frame = pd.read_csv(
        RAW_CSV,
        usecols=wanted,
        dtype={"id": str},
        low_memory=False,
    )
    frame["update_date"] = pd.to_datetime(frame["update_date"], errors="coerce")

    # NaT rows yield a missing year, which compares False and is filtered out.
    is_recent = frame["update_date"].dt.year >= MIN_YEAR
    recent = frame[is_recent].copy()

    # Derive the year only after filtering, when no NaT values remain.
    recent["year"] = recent["update_date"].dt.year

    output_columns = ["id", "title", "abstract", "categories", "year"]
    return recent[output_columns].reset_index(drop=True)
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed. Any value that is not a
    ``str`` (for example ``None`` or a float NaN) yields an empty string.
    """
    if isinstance(text, str):
        # split() with no arguments breaks on whitespace runs and discards
        # the empty edges, so re-joining with one space normalizes newlines,
        # tabs and repeated spaces, and trims both ends in a single pass.
        return " ".join(text.split())
    return ""
def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Normalize whitespace in the text columns of *df*, in place.

    Args:
        df: Frame to clean; its columns are overwritten directly.
        text_columns: Names of the columns to clean. Defaults to
            ``["title", "abstract"]`` when ``None``.

    Returns:
        The same ``df`` object that was passed in, after cleaning.
    """
    targets = ["title", "abstract"] if text_columns is None else text_columns
    for name in targets:
        # Element-wise: each cell is passed through clean_text.
        df[name] = df[name].map(clean_text)
    return df
def preprocess_and_save():
    """Build the processed dataset, or reload it if it was already built.

    If both PROCESSED_PARQUET and PROCESSED_CSV exist, the parquet copy is
    read and returned without reprocessing. Otherwise the raw data is
    loaded with load_filtered(), optionally truncated to the first
    MAX_TEXTS rows, cleaned with clean_dataframe(), and written to both
    output files.

    Returns:
        The processed DataFrame, or ``None`` if any step raised. Failures
        are reported through ``console_manager.print_error`` rather than
        propagated.
    """
    try:
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        with console_manager.status("Processing file..."):
            df = load_filtered()

            # Truncate before cleaning: cleaning is element-wise, so the
            # result is the same but only the kept rows are processed.
            # copy() detaches the slice so the in-place column assignment
            # in clean_dataframe() does not act on a view of the full frame.
            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS).copy()
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            df = clean_dataframe(df)

            # Make sure the output directory exists; otherwise the first
            # write fails on a fresh checkout.
            PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)

        console_manager.print_success(
            f"Pŕe-processing complete. File save in: {PROCESSED_DIR} "
        )
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        console_manager.print_error(f"Pré-processing failed: {e}")
        return None
    return df