File size: 2,395 Bytes
532f1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import re
from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
from utils.console_manager import console_manager

# Paths
# Input: the raw arXiv astro-ph dump consumed by load_filtered().
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
# Outputs: the filtered/cleaned dataset, written in both formats by
# preprocess_and_save() (Parquet is the one re-loaded on later runs).
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"


def load_filtered() -> pd.DataFrame:
    """Load the raw arXiv CSV and keep only rows updated in MIN_YEAR or later.

    Returns a DataFrame with columns id, title, abstract, categories, year
    (year derived from ``update_date``) and a fresh 0..n-1 index.
    """
    wanted = ["id", "title", "abstract", "categories", "update_date"]
    frame = pd.read_csv(RAW_CSV, usecols=wanted, dtype={"id": str}, low_memory=False)
    # Unparseable dates become NaT; their year comparison is False, so they drop.
    frame["update_date"] = pd.to_datetime(frame["update_date"], errors="coerce")
    recent = frame.loc[frame["update_date"].dt.year >= MIN_YEAR].copy()
    recent["year"] = recent["update_date"].dt.year
    return recent[["id", "title", "abstract", "categories", "year"]].reset_index(
        drop=True
    )


def clean_text(text: str) -> str:
    """Collapse all whitespace runs in *text* to single spaces and trim.

    Non-string input (e.g. NaN from pandas) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    flattened = text.replace("\n", " ").replace("\r", " ")
    return re.sub(r"\s+", " ", flattened).strip()


def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Apply clean_text to each column in *text_columns* (in place).

    Defaults to the "title" and "abstract" columns. Returns the same
    DataFrame for call chaining.
    """
    columns = ["title", "abstract"] if text_columns is None else text_columns
    for column in columns:
        df[column] = df[column].apply(clean_text)
    return df


def preprocess_and_save():
    """Load, filter, and clean the raw dump, then cache it as CSV + Parquet.

    If both processed files already exist, the Parquet copy is loaded and
    returned without re-processing. Optionally truncates the dataset to
    MAX_TEXTS rows when that limit is configured.

    Returns:
        The processed DataFrame, or None if preprocessing failed.
    """
    try:
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        # The status binding was unused; the context manager alone suffices.
        with console_manager.status("Processing file..."):
            df = load_filtered()
            df = clean_dataframe(df)

            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS)
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)
            console_manager.print_success(
                f"Pre-processing complete. Files saved in: {PROCESSED_DIR}"
            )

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None.
        console_manager.print_error(f"Pre-processing failed: {e}")
        return None
    return df