Spaces:
Sleeping
Sleeping
File size: 2,395 Bytes
532f1f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import pandas as pd
import re
from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
from utils.console_manager import console_manager
# Paths
# Input: raw CSV read by load_filtered(). The file name suggests arXiv
# astro-ph metadata -- confirm against whatever populates RAW_DIR.
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
# Outputs: the same processed table written in two formats by
# preprocess_and_save(). Their joint presence is also used there as the
# "already processed" marker, and the parquet file is the one read back.
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
def load_filtered():
    """Load the raw CSV and keep only rows updated in MIN_YEAR or later.

    Only the id, title, abstract, categories and update_date columns are
    read from RAW_CSV, with ``id`` forced to ``str``. ``update_date`` is
    parsed with ``errors="coerce"``, so unparseable dates become NaT; those
    rows fail the year comparison and are dropped along with older rows.

    Returns:
        A DataFrame with columns id, title, abstract, categories and year
        (the year of ``update_date``), re-indexed from 0.
    """
    wanted = ["id", "title", "abstract", "categories", "update_date"]
    raw = pd.read_csv(RAW_CSV, usecols=wanted, dtype={"id": str}, low_memory=False)
    raw["update_date"] = pd.to_datetime(raw["update_date"], errors="coerce")

    is_recent = raw["update_date"].dt.year >= MIN_YEAR
    recent = raw[is_recent].copy()
    # Derive the year after filtering, once the NaT rows are gone.
    recent["year"] = recent["update_date"].dt.year

    output_columns = ["id", "title", "abstract", "categories", "year"]
    return recent[output_columns].reset_index(drop=True)
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Line breaks are turned into spaces, every run of whitespace is collapsed
    to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    if isinstance(text, str):
        single_line = text.replace("\n", " ").replace("\r", " ")
        return re.sub(r"\s+", " ", single_line).strip()
    return ""
def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Apply ``clean_text`` to every value of the given text columns.

    Args:
        df: The frame to clean. It is modified in place.
        text_columns: Names of the columns to clean. When ``None``, the
            ``title`` and ``abstract`` columns are used.

    Returns:
        The same ``df`` object that was passed in, with the columns cleaned.
    """
    targets = ["title", "abstract"] if text_columns is None else text_columns
    for column_name in targets:
        cleaned = df[column_name].apply(clean_text)
        df[column_name] = cleaned
    return df
def preprocess_and_save():
    """Build the processed dataset, or reuse it when it is already on disk.

    If both PROCESSED_PARQUET and PROCESSED_CSV exist, the parquet file is
    loaded and returned without reprocessing. Otherwise the raw data is
    loaded and filtered with ``load_filtered``, cleaned with
    ``clean_dataframe``, optionally truncated to the first MAX_TEXTS rows,
    and written to both output files.

    Returns:
        The processed DataFrame, or ``None`` if any step raised an
        exception (the error is reported through ``console_manager``
        instead of being propagated).
    """
    try:
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        with console_manager.status("Processing file..."):
            df = load_filtered()
            df = clean_dataframe(df)

            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS)
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            # The writers below do not create missing directories, so make
            # sure the output folder exists on a fresh checkout.
            PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)

        console_manager.print_success(
            f"Pre-processing complete. File saved in: {PROCESSED_DIR}"
        )
    except Exception as e:
        # Top-level boundary: report the failure and signal it with None so
        # callers that check the return value keep working.
        console_manager.print_error(f"Pre-processing failed: {e}")
        return None
    return df
|