beatrizpm committed
Commit 532f1f0 · verified · 1 Parent(s): f26ff01

Upload 9 files

Files changed (10)
  1. .gitattributes +1 -0
  2. app_dashboard.py +55 -0
  3. arxiv_astro_ph.csv +3 -0
  4. cluster.py +126 -0
  5. config.py +23 -0
  6. console_manager.py +41 -0
  7. embedder.py +135 -0
  8. latex_to_unicode.py +112 -0
  9. preprocess.py +70 -0
  10. search.py +134 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ arxiv_astro_ph.csv filter=lfs diff=lfs merge=lfs -text
app_dashboard.py ADDED
@@ -0,0 +1,55 @@
+ import streamlit as st
+ from config import EMBEDDINGS_DIR
+
+ from embeddings.search import run_search
+ from embeddings.cluster import run_clustering_pipeline
+ from embeddings.embedder import (
+     initialize_embedding_model,
+     initialize_chroma,
+     run_pipeline,
+ )
+
+ # BASIC STREAMLIT SETUP
+ st.set_page_config(
+     page_title="Semantic Clusters Dashboard",
+     page_icon="🪐",
+     layout="wide",
+ )
+
+ st.title("Semantic Clusters Dashboard")
+ st.markdown("Visualize document clusters with interactive semantic search.")
+
+
+ @st.cache_resource
+ def get_embeddings_model():
+     return initialize_embedding_model()
+
+
+ @st.cache_resource
+ def get_vectordb():
+     embeddings_model = get_embeddings_model()
+     return initialize_chroma(embeddings_model, EMBEDDINGS_DIR)
+
+
+ embedding_model = get_embeddings_model()
+ vectordb = get_vectordb()
+
+ # MAIN INTERFACE
+ (
+     tab_ingestion,
+     tab_clusters,
+     tab_search,
+ ) = st.tabs(["Ingestion & Embedding", "3D Clusters", "Semantic Search"])
+
+ with tab_ingestion:
+     run_pipeline(force_run=False)
+
+ with tab_search:
+     run_search(embedding_model=embedding_model, vectordb=vectordb)
+
+ with tab_clusters:
+     st.header("3D Clusters View")
+     if st.button("🌀 Generate clusters"):
+         with st.spinner("Generating clusters..."):
+             run_clustering_pipeline(embedding_model=embedding_model, vectordb=vectordb)
+         st.success("Clusters generated!")
arxiv_astro_ph.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01679ab07b5d2149eaf37239fef68e0a8dc9b7cc715295c8f559419bdba00b21
+ size 553004170
cluster.py ADDED
@@ -0,0 +1,126 @@
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ import umap
+ import hdbscan
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+ from utils.console_manager import console_manager
+
+ from embeddings.embedder import (
+     initialize_chroma,
+     initialize_embedding_model,
+     extract_embeddings,
+ )
+
+
+ def reduce_dimensionality(embeddings: np.ndarray, n_components: int = 3):
+     reducer = umap.UMAP(
+         n_neighbors=15,
+         min_dist=0.1,
+         n_components=n_components,
+         metric="cosine",
+         random_state=42,
+     )
+     embedding_3d = reducer.fit_transform(embeddings)
+     console_manager.print_info(
+         f"UMAP dimensionality reduction done: {embedding_3d.shape}"
+     )
+     return embedding_3d
+
+
+ def cluster_embeddings(embedding_3d: np.ndarray, min_cluster_size: int = 20):
+     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
+     labels = clusterer.fit_predict(embedding_3d)
+     console_manager.print_info(
+         f"HDBSCAN clustering done: {len(set(labels))} clusters found"
+     )
+     return labels
+
+
+ def visualize_3d(embedding_3d: np.ndarray, metadata: list, labels: np.ndarray):
+     df_vis = pd.DataFrame(
+         {
+             "x": embedding_3d[:, 0],
+             "y": embedding_3d[:, 1],
+             "z": embedding_3d[:, 2],
+             "title": [m.get("title", "") for m in metadata],
+             "category": [m.get("categories", "") for m in metadata],
+             "year": [m.get("year", 0) for m in metadata],
+             "cluster": labels,
+         }
+     )
+
+     # Count number of clusters (excluding outliers)
+     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+
+     # Define color map (outliers = black)
+     unique_labels = sorted(set(labels))
+     palette = px.colors.qualitative.Plotly
+     color_map = {
+         label: ("black" if label == -1 else palette[label % len(palette)])
+         for label in unique_labels
+     }
+
+     fig = go.Figure()
+
+     for label in unique_labels:
+         cluster_points = df_vis[df_vis["cluster"] == label]
+         color = color_map[label]
+         name = f"Cluster {label}" if label != -1 else "Outliers"
+
+         hover_text = (
+             "Title: %{customdata[0]}<br>"
+             "Category: %{customdata[1]}<br>"
+             "Year: %{customdata[2]}<br>"
+             "Cluster: %{customdata[3]}"
+         )
+
+         fig.add_trace(
+             go.Scatter3d(
+                 x=cluster_points["x"],
+                 y=cluster_points["y"],
+                 z=cluster_points["z"],
+                 mode="markers",
+                 marker=dict(size=4, color=color, opacity=0.8),
+                 name=name,
+                 customdata=np.stack(
+                     [
+                         cluster_points["title"],
+                         cluster_points["category"],
+                         cluster_points["year"],
+                         cluster_points["cluster"],
+                     ],
+                     axis=-1,
+                 ),
+                 hovertemplate=hover_text,
+             )
+         )
+
+     fig.update_layout(
+         title=f"Clusters: {n_clusters}",
+         scene=dict(
+             xaxis_title="Dimension 1",
+             yaxis_title="Dimension 2",
+             zaxis_title="Dimension 3",
+         ),
+         legend=dict(itemsizing="constant"),
+     )
+     st.plotly_chart(fig, use_container_width=True)
+
+
+ def run_clustering_pipeline(embedding_model=None, vectordb=None):
+     with console_manager.status("Running clustering pipeline..."):
+         if embedding_model is None:
+             embedding_model = initialize_embedding_model()
+         if vectordb is None:
+             vectordb = initialize_chroma(embedding_model)
+         if vectordb is None:
+             st.warning("No ChromaDB found. Run embeddings generation first.")
+             return
+
+         embeddings, metadata = extract_embeddings(vectordb)
+         embedding_3d = reduce_dimensionality(embeddings)
+         labels = cluster_embeddings(embedding_3d)
+         visualize_3d(embedding_3d, metadata, labels)
config.py ADDED
@@ -0,0 +1,23 @@
+ from pathlib import Path
+ import os
+
+ # Base paths
+ BASE_DIR = Path(__file__).resolve().parent.parent
+ DATA_DIR = Path(os.environ.get("DATA_DIR", BASE_DIR / "datasets"))
+
+ RAW_DIR = DATA_DIR / "raw"
+ PROCESSED_DIR = DATA_DIR / "processed"
+ EMBEDDINGS_DIR = DATA_DIR / "embeddings"
+
+ # Ensure directories exist
+ for path in [DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR]:
+     path.mkdir(parents=True, exist_ok=True)
+
+ # Preprocessing configs
+ MIN_YEAR = 2020
+ MAX_TEXTS = 2000
+
+ # Embeddings configuration
+ EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+
+ # EMBEDDING_MODEL_NAME = "allenai/specter2"
console_manager.py ADDED
@@ -0,0 +1,41 @@
+ from typing import Any, Dict
+ from rich.console import Console
+ from rich.markdown import Markdown
+ from rich.status import Status
+
+
+ def dict_to_markdown_table(dictionary: Dict[str, Any]) -> str:
+     table = "| Key | Value |\n| --- | ----- |\n"
+     for key, value in dictionary.items():
+         table += f"| {key} | {value} |\n"
+     return table
+
+
+ class ConsoleManager:
+     def __init__(self) -> None:
+         self.console = Console()
+
+     def print_markdown(self, markdown: str) -> None:
+         self.console.print(Markdown(markdown))
+
+     def print_success(self, message: str) -> None:
+         self.console.print(f"[bold green](success)[/bold green] {message}")
+
+     def print_error(self, message: str) -> None:
+         self.console.print(f"[bold red](error)[/bold red] {message}")
+
+     def print_info(self, message: str) -> None:
+         self.console.print(f"[bold blue](info)[/bold blue] {message}")
+
+     def print_dict(self, dictionary: Dict[str, Any], header: str = "") -> None:
+         markdown_table = dict_to_markdown_table(dictionary)
+         if header:
+             self.print_markdown(f"# {header}\n")
+
+         self.console.print(Markdown(markdown_table))
+
+     def status(self, message: str) -> Status:
+         return self.console.status(f"[bold green]{message}[/bold green]")
+
+
+ console_manager = ConsoleManager()
embedder.py ADDED
@@ -0,0 +1,135 @@
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from pathlib import Path
+ from typing import Optional
+
+ from utils.console_manager import console_manager
+ from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
+ from data_pipeline.preprocess import PROCESSED_PARQUET, preprocess_and_save
+
+
+ def initialize_embedding_model(
+     model_name: str = EMBEDDING_MODEL_NAME,
+ ) -> HuggingFaceEmbeddings:
+     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+     console_manager.print_info(f"Initialized embeddings model: {model_name}")
+     return embeddings
+
+
+ def initialize_chroma(
+     embedding_model: HuggingFaceEmbeddings, chroma_path: Path = EMBEDDINGS_DIR
+ ) -> Chroma:
+     if chroma_path.exists() and any(chroma_path.iterdir()):
+         console_manager.print_info(f"Loading existing ChromaDB from {chroma_path}")
+     else:
+         console_manager.print_info(f"Creating new ChromaDB at: {chroma_path}")
+
+     vectordb = Chroma(
+         persist_directory=str(chroma_path), embedding_function=embedding_model
+     )
+     return vectordb
+
+
+ def load_preprocessed_data() -> Optional[pd.DataFrame]:
+     if not PROCESSED_PARQUET.exists():
+         console_manager.print_error(f"Processed file not found: {PROCESSED_PARQUET}")
+         return None
+
+     df = pd.read_parquet(PROCESSED_PARQUET)
+     df["content"] = (
+         "Title: "
+         + df["title"]
+         + ". Abstract: "
+         + df["abstract"]
+         + ". Categories: "
+         + df["categories"].apply(
+             lambda x: ", ".join(x) if isinstance(x, list) else str(x)
+         )
+     )
+     return df
+
+
+ def prepare_documents(df: pd.DataFrame) -> list[dict]:
+     docs = [
+         {
+             "id": str(i),
+             "content": row["content"],
+             "metadata": {
+                 "id": str(i),
+                 "title": row["title"],
+                 "categories": row["categories"],
+                 "year": int(row["year"]),
+             },
+         }
+         for i, row in df.iterrows()
+     ]
+     return docs
+
+
+ def add_embeddings_to_chroma(vectordb: Chroma, docs: list[dict]):
+     vectordb.add_texts(
+         texts=[d["content"] for d in docs],
+         metadatas=[d["metadata"] for d in docs],
+     )
+     console_manager.print_success("Embeddings generated and stored successfully!")
+
+
+ def embed_and_store():
+     try:
+         with console_manager.status("Generating embeddings..."):
+             embedding_model = initialize_embedding_model()
+             vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+
+             df = load_preprocessed_data()
+             if df is None:
+                 return None
+
+             docs = prepare_documents(df)
+             add_embeddings_to_chroma(vectordb, docs)
+             return vectordb
+
+     except Exception as e:
+         console_manager.print_error(f"Embedding generation failed: {e}")
+         return None
+
+
+ def extract_embeddings(_vectordb):
+     collection = _vectordb._collection
+     data = collection.get(include=["metadatas", "documents", "embeddings"])
+     embeddings = np.array(data["embeddings"])
+     metadata = data["metadatas"]
+     return embeddings, metadata
+
+
+ def run_pipeline(force_run: bool = False):
+     st.header("Ingestion & Embedding")
+
+     if st.button("Run Ingestion & Embeddings Pipeline"):
+         with st.spinner("Running full pipeline..."):
+             preprocess_and_save()  # Step 1: extraction and basic cleaning
+             embedding_model = initialize_embedding_model()
+             vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+             collection_data = vectordb._collection.get(include=["embeddings"])
+             embeddings = collection_data["embeddings"]
+
+             if embeddings is not None:
+                 if isinstance(embeddings, np.ndarray):
+                     embeddings_exist = embeddings.size > 0
+                 else:
+                     embeddings_exist = len(embeddings) > 0
+             else:
+                 embeddings_exist = False
+
+             if embeddings_exist and not force_run:
+                 st.warning("Embeddings already exist. Skipping embedding generation.")
+                 return vectordb
+
+             vectordb = embed_and_store()
+
+         st.success("Pipeline finished successfully!")
+         return vectordb
latex_to_unicode.py ADDED
@@ -0,0 +1,112 @@
+ LATEX_TO_UNICODE = {
+     # Lowercase Greek letters
+     r"\alpha": "α",
+     r"\beta": "β",
+     r"\gamma": "γ",
+     r"\delta": "δ",
+     r"\epsilon": "ε",
+     r"\varepsilon": "ε",
+     r"\zeta": "ζ",
+     r"\eta": "η",
+     r"\theta": "θ",
+     r"\vartheta": "ϑ",
+     r"\iota": "ι",
+     r"\kappa": "κ",
+     r"\lambda": "λ",
+     r"\mu": "μ",
+     r"\nu": "ν",
+     r"\xi": "ξ",
+     r"\pi": "π",
+     r"\varpi": "ϖ",
+     r"\rho": "ρ",
+     r"\varrho": "ϱ",
+     r"\sigma": "σ",
+     r"\varsigma": "ς",
+     r"\tau": "τ",
+     r"\upsilon": "υ",
+     r"\phi": "φ",
+     r"\varphi": "ϕ",
+     r"\chi": "χ",
+     r"\psi": "ψ",
+     r"\omega": "ω",
+     # Uppercase Greek letters
+     r"\Gamma": "Γ",
+     r"\Delta": "Δ",
+     r"\Theta": "Θ",
+     r"\Lambda": "Λ",
+     r"\Xi": "Ξ",
+     r"\Pi": "Π",
+     r"\Sigma": "Σ",
+     r"\Upsilon": "Υ",
+     r"\Phi": "Φ",
+     r"\Psi": "Ψ",
+     r"\Omega": "Ω",
+     # Operators and relations
+     r"\times": "×",
+     r"\div": "÷",
+     r"\pm": "±",
+     r"\mp": "∓",
+     r"\cdot": "·",
+     r"\ast": "∗",
+     r"\star": "★",
+     r"\propto": "∝",
+     r"\approx": "≈",
+     r"\sim": "∼",
+     r"\simeq": "≃",
+     r"\equiv": "≡",
+     r"\neq": "≠",
+     r"\geq": "≥",
+     r"\leq": "≤",
+     r"\gg": "≫",
+     r"\ll": "≪",
+     r"\infty": "∞",
+     r"\partial": "∂",
+     r"\nabla": "∇",
+     r"\sum": "Σ",
+     r"\int": "∫",
+     # Arrows
+     r"\rightarrow": "→",
+     r"\leftarrow": "←",
+     r"\leftrightarrow": "↔",
+     r"\Rightarrow": "⇒",
+     r"\Leftarrow": "⇐",
+     r"\Leftrightarrow": "⇔",
+     # Common symbols and constants
+     r"\degree": "°",
+     r"\circ": "°",
+     r"\prime": "′",
+     r"\'": "′",
+     r"\second": "″",
+     r"\ldots": "…",
+     r"\cdots": "⋯",
+     r"\bullet": "•",
+     r"\perp": "⊥",
+     r"\parallel": "∥",
+     r"\rightarrowtail": "↣",
+     r"\leftarrowtail": "↢",
+     # Stylized math letters
+     r"\mathcal{Q}": "𝒬",
+     r"\mathcal{M}": "ℳ",
+     r"\mathcal{L}": "ℒ",
+     r"\mathcal{H}": "ℋ",
+     r"\mathcal{N}": "𝒩",
+     r"\mathbb{R}": "ℝ",
+     r"\mathbb{C}": "ℂ",
+     r"\mathbb{Z}": "ℤ",
+     r"\mathbb{N}": "ℕ",
+     r"\mathbb{Q}": "ℚ",
+     # Physical units and exponents
+     r"\^-1": "⁻¹",
+     r"\^-2": "⁻²",
+     r"\^-3": "⁻³",
+     r"\^1": "¹",
+     r"\^2": "²",
+     r"\^3": "³",
+     # Other useful symbols
+     r"\dagger": "†",
+     r"\ddagger": "‡",
+     r"\angle": "∠",
+     r"\to": "→",
+     r"\~": "~",
+ }
preprocess.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import re
+ from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
+ from utils.console_manager import console_manager
+
+ # Paths
+ RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
+ PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
+ PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
+
+
+ def load_filtered():
+     usecols = ["id", "title", "abstract", "categories", "update_date"]
+     df = pd.read_csv(RAW_CSV, usecols=usecols, dtype={"id": str}, low_memory=False)
+     df["update_date"] = pd.to_datetime(df["update_date"], errors="coerce")
+     df = df[df["update_date"].dt.year >= MIN_YEAR].copy()
+     df["year"] = df["update_date"].dt.year
+     df = df[["id", "title", "abstract", "categories", "year"]].reset_index(drop=True)
+     return df
+
+
+ def clean_text(text: str) -> str:
+     if not isinstance(text, str):
+         return ""
+     text = text.replace("\n", " ").replace("\r", " ")
+     text = re.sub(r"\s+", " ", text)
+     return text.strip()
+
+
+ def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
+     if text_columns is None:
+         text_columns = ["title", "abstract"]
+     for col in text_columns:
+         df[col] = df[col].apply(clean_text)
+     return df
+
+
+ def preprocess_and_save():
+     try:
+         if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
+             console_manager.print_info(
+                 f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
+             )
+             df = pd.read_parquet(PROCESSED_PARQUET)
+             console_manager.print_info(
+                 f"Loaded existing processed data ({df.shape[0]} rows)."
+             )
+             return df
+
+         with console_manager.status("Processing file..."):
+             df = load_filtered()
+             df = clean_dataframe(df)
+
+             if MAX_TEXTS is not None:
+                 df = df.head(MAX_TEXTS)
+                 console_manager.print_info(
+                     f"Limiting dataset to {len(df)} rows for testing."
+                 )
+
+             df.to_csv(PROCESSED_CSV, index=False)
+             df.to_parquet(PROCESSED_PARQUET, index=False)
+             console_manager.print_success(
+                 f"Pre-processing complete. Files saved in: {PROCESSED_DIR}"
+             )
+
+     except Exception as e:
+         console_manager.print_error(f"Pre-processing failed: {e}")
+         return None
+     return df
search.py ADDED
@@ -0,0 +1,134 @@
+ import re
+ import torch
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer, util
+ from embeddings.embedder import initialize_embedding_model, initialize_chroma
+ from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
+ from embeddings.latex_to_unicode import LATEX_TO_UNICODE
+
+
+ def decode_latex(text: str) -> str:
+     for latex, uni in LATEX_TO_UNICODE.items():
+         text = text.replace(latex, uni)
+     text = re.sub(r"\\[a-zA-Z]+(\{.*?\})?", "", text)
+     return text.replace("{", "").replace("}", "").strip()
+
+
+ # Used to compute sentence-level similarity within a single document
+ sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+
+
+ def best_sentence_by_embedding(content: str, query: str):
+     sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", content) if s.strip()]
+     embeddings = sentence_model.encode(sentences + [query], convert_to_tensor=True)
+     cosine_scores = util.cos_sim(embeddings[-1], embeddings[:-1])[0]
+     best_idx = int(torch.argmax(cosine_scores))
+     return sentences[best_idx], cosine_scores[best_idx].item()
+
+
+ def semantic_search(vectordb, query, k=5):
+     try:
+         results_with_scores = vectordb.similarity_search_with_score(query, k=k)
+     except Exception:
+         raw_results = vectordb.similarity_search(query, k=k)
+         results_with_scores = [(r, None) for r in raw_results]
+     return results_with_scores
+
+
+ def normalize_score(distance):
+     if distance is None:
+         return 0.0
+     return 1 / (1 + distance)
+
+
+ def get_user_input():
+     query = st.text_input("Enter search query:")
+     k = st.slider("Number of results", min_value=1, max_value=10, value=5)
+     return query, k
+
+
+ def truncate_sentence(text: str, max_len: int = 1000) -> str:
+     return text[:max_len] + ("..." if len(text) > max_len else "")
+
+
+ def process_results(results_with_scores, query):
+     ranked_results = []
+     seen_ids = set()
+
+     for doc, doc_score in results_with_scores:
+         metadata = doc.metadata or {}
+         doc_id = metadata.get("id", "N/A")
+         if doc_id in seen_ids:
+             continue
+         seen_ids.add(doc_id)
+
+         categories = metadata.get("categories", "N/A")
+         year = metadata.get("year", "N/A")
+
+         raw_content = decode_latex(doc.page_content)
+         title = raw_content.split(". ", 1)[0].replace("Title: ", "").strip()
+         content = raw_content.split("Abstract:", 1)[1].strip()
+
+         best_sentence, local_relevance = best_sentence_by_embedding(content, query)
+         # doc_score may be None when the score-less fallback search was used
+         final_score = 0.6 * local_relevance + 0.4 * (1 - (doc_score or 0))
+
+         ranked_results.append(
+             {
+                 "doc": doc,
+                 "doc_id": doc_id,
+                 "categories": categories,
+                 "year": year,
+                 "title": title,
+                 "content": content,
+                 "best_sentence": best_sentence,
+                 "local_relevance": local_relevance,
+                 "doc_score": doc_score,
+                 "final_score": final_score,
+             }
+         )
+
+     return sorted(ranked_results, key=lambda x: x["final_score"], reverse=True)
+
+
+ def display_results(ranked_results):
+     st.success(f"Top {len(ranked_results)} results found:")
+     for i, r in enumerate(ranked_results, 1):
+         content = r["content"]
+         highlighted_content = content.replace(
+             r["best_sentence"], f"**{r['best_sentence']}**", 1
+         )
+         st.markdown(f"**RESULT {i}:**")
+         st.markdown(
+             f"Document ID: {r['doc_id']} | Categories: {r['categories']} | Year: {r['year']} | "
+             f"Doc Relevance: {1 - (r['doc_score'] if r['doc_score'] else 0):.2f} | "
+             f"Best Sentence Relevance: {r['local_relevance']:.2f}"
+         )
+         st.markdown(f"Title: {r['title']}")
+         st.markdown(f"Most Relevant Excerpt: {truncate_sentence(highlighted_content)}")
+         st.markdown("---")
+
+
+ def run_search(embedding_model=None, vectordb=None):
+     st.header("🔎 Semantic Search")
+     st.subheader("Search for semantically similar documents")
+
+     if embedding_model is None:
+         embedding_model = initialize_embedding_model()
+     if vectordb is None:
+         vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+     if not vectordb:
+         st.warning("No ChromaDB found. Run embeddings generation first.")
+         return
+
+     query, k = get_user_input()
+     if not query:
+         st.info("Type a query above to start searching.")
+         return
+
+     results_with_scores = semantic_search(vectordb, query, k=k * 2)
+
+     if not results_with_scores:
+         st.warning("No results found.")
+         return
+
+     ranked_results = process_results(results_with_scores, query)
+     display_results(ranked_results)