Spaces:

beatrizpm
/

csds-project

Sleeping

App Files Files Community

csds-project / preprocess.py

beatrizpm

Upload 9 files

532f1f0 verified about 2 months ago

raw

history blame contribute delete

2.4 kB

	import pandas as pd
	import re
	from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
	from utils.console_manager import console_manager

	# Paths
	# Input: the raw CSV that load_filtered() reads.
	RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
	# Outputs: preprocess_and_save() writes the same processed frame to both
	# files, and reloads the Parquet one when both already exist.
	PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
	PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"


	def load_filtered():
	    """Load the raw CSV and keep only rows updated in ``MIN_YEAR`` or later.

	    Reads ``RAW_CSV`` with ``id`` kept as a string, parses ``update_date``
	    (unparseable values become ``NaT``), and drops every row whose year is
	    below ``MIN_YEAR``. Rows with ``NaT`` never satisfy the comparison, so
	    they are dropped as well.

	    Returns:
	        A freshly indexed DataFrame with the columns ``id``, ``title``,
	        ``abstract``, ``categories`` and ``year``, where ``year`` is taken
	        from the parsed ``update_date``.
	    """
	    wanted = ["id", "title", "abstract", "categories", "update_date"]
	    frame = pd.read_csv(RAW_CSV, usecols=wanted, dtype={"id": str}, low_memory=False)

	    # errors="coerce" turns malformed dates into NaT instead of raising.
	    frame["update_date"] = pd.to_datetime(frame["update_date"], errors="coerce")

	    is_recent = frame["update_date"].dt.year >= MIN_YEAR
	    recent = frame[is_recent].copy()
	    recent["year"] = recent["update_date"].dt.year

	    output_columns = ["id", "title", "abstract", "categories", "year"]
	    return recent[output_columns].reset_index(drop=True)


	def clean_text(text: str) -> str:
	    """Collapse all whitespace in *text* to single spaces and trim the ends.

	    Args:
	        text: The value to normalize. Anything that is not a ``str``
	            (for example ``None`` or a float ``NaN``) is treated as empty.

	    Returns:
	        The normalized string, or ``""`` for non-string input.
	    """
	    if isinstance(text, str):
	        # Line breaks are turned into spaces first, then every whitespace
	        # run is squeezed down to one space.
	        flattened = text.replace("\n", " ").replace("\r", " ")
	        return re.sub(r"\s+", " ", flattened).strip()
	    return ""


	def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
	    """Normalize whitespace in the text columns of *df*, modifying it in place.

	    Args:
	        df: Frame whose selected columns are overwritten with cleaned values.
	        text_columns: Names of the columns to clean. ``None`` selects
	            ``["title", "abstract"]``.

	    Returns:
	        The same ``df`` object that was passed in, after modification.
	    """
	    targets = ["title", "abstract"] if text_columns is None else text_columns
	    for name in targets:
	        # clean_text maps non-string cells (e.g. missing values) to "".
	        df[name] = df[name].apply(clean_text)
	    return df


	def preprocess_and_save():
	    """Build the processed dataset, or reuse it when it already exists.

	    If both ``PROCESSED_PARQUET`` and ``PROCESSED_CSV`` exist, the Parquet
	    file is loaded and returned without redoing any work. Otherwise the raw
	    data is loaded with ``load_filtered``, optionally truncated to the first
	    ``MAX_TEXTS`` rows, cleaned with ``clean_dataframe`` and written to both
	    output files.

	    Returns:
	        The processed DataFrame, or ``None`` if any step raised. Failures
	        are reported through ``console_manager`` instead of propagating.
	    """
	    try:
	        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
	            console_manager.print_info(
	                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
	            )
	            df = pd.read_parquet(PROCESSED_PARQUET)
	            console_manager.print_info(
	                f"Loaded existing processed data ({df.shape[0]} rows)."
	            )
	            return df

	        with console_manager.status("Processing file..."):
	            df = load_filtered()

	            if MAX_TEXTS is not None:
	                # Truncate before cleaning so discarded rows are never
	                # processed; cleaning is row-wise, so the result is the
	                # same. copy() detaches the slice from the full frame so
	                # the column assignments in clean_dataframe do not write
	                # into a view.
	                df = df.head(MAX_TEXTS).copy()

	            df = clean_dataframe(df)

	            # The output directory may not exist on a fresh checkout.
	            PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
	            df.to_csv(PROCESSED_CSV, index=False)
	            df.to_parquet(PROCESSED_PARQUET, index=False)

	        # Messages are emitted after the status display has closed.
	        if MAX_TEXTS is not None:
	            console_manager.print_info(
	                f"Limiting dataset to {len(df)} rows for testing."
	            )
	        console_manager.print_success(
	            f"Pre-processing complete. Files saved in: {PROCESSED_DIR}"
	        )
	    except Exception as e:
	        # Deliberate best-effort boundary: report the failure and signal it
	        # to the caller with None rather than crashing the application.
	        console_manager.print_error(f"Pre-processing failed: {e}")
	        return None
	    return df