Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- .gitattributes +1 -0
- app_dashboard.py +55 -0
- arxiv_astro_ph.csv +3 -0
- cluster.py +126 -0
- config.py +23 -0
- console_manager.py +41 -0
- embedder.py +135 -0
- latex_to_unicode.py +112 -0
- preprocess.py +70 -0
- search.py +134 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
arxiv_astro_ph.csv filter=lfs diff=lfs merge=lfs -text
|
app_dashboard.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from config import EMBEDDINGS_DIR
|
| 3 |
+
|
| 4 |
+
from embeddings.search import run_search
|
| 5 |
+
from embeddings.cluster import run_clustering_pipeline
|
| 6 |
+
from embeddings.embedder import (
|
| 7 |
+
initialize_embedding_model,
|
| 8 |
+
initialize_chroma,
|
| 9 |
+
run_pipeline,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# BASIC STREAMLIT CONFIGURATION
|
| 13 |
+
st.set_page_config(
|
| 14 |
+
page_title="Semantic Clusters Dashboard",
|
| 15 |
+
page_icon="🪐",
|
| 16 |
+
layout="wide",
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
st.title("Semantic Clusters Dashboard")
|
| 20 |
+
st.markdown("Visualize document clusters with interactive semantic search.")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@st.cache_resource
def get_embeddings_model():
    """Return the embedding model built by ``initialize_embedding_model``.

    Decorated with ``st.cache_resource`` so the dashboard does not rebuild
    the model each time the script is re-executed.
    """
    model = initialize_embedding_model()
    return model
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@st.cache_resource
def get_vectordb():
    """Return the Chroma store located at ``EMBEDDINGS_DIR``.

    The store is opened with the (cached) embedding model and is itself
    decorated with ``st.cache_resource``.
    """
    return initialize_chroma(get_embeddings_model(), EMBEDDINGS_DIR)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Resources shared by every tab.
embedding_model = get_embeddings_model()
vectordb = get_vectordb()

# MAIN INTERFACE
tab_labels = ["Ingestion & Embedding", "3D Clusters", "Semantic Search "]
tab_ingestion, tab_clusters, tab_search = st.tabs(tab_labels)

with tab_ingestion:
    run_pipeline(force_run=False)

with tab_search:
    run_search(embedding_model=embedding_model, vectordb=vectordb)

with tab_clusters:
    st.header("3D Clusters View")
    generate_clicked = st.button("🌀 Generate clusters")
    if generate_clicked:
        with st.spinner("Generating clusters..."):
            run_clustering_pipeline(embedding_model=embedding_model, vectordb=vectordb)
        # NOTE(review): original nesting of this call was lost in extraction;
        # it is assumed to belong to the button branch - confirm.
        st.success("Clusters!")
|
arxiv_astro_ph.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01679ab07b5d2149eaf37239fef68e0a8dc9b7cc715295c8f559419bdba00b21
|
| 3 |
+
size 553004170
|
cluster.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import umap
|
| 5 |
+
import hdbscan
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
import plotly.graph_objects as go
|
| 8 |
+
|
| 9 |
+
from utils.console_manager import console_manager
|
| 10 |
+
|
| 11 |
+
from embeddings.embedder import (
|
| 12 |
+
initialize_chroma,
|
| 13 |
+
initialize_embedding_model,
|
| 14 |
+
extract_embeddings,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def reduce_dimensionality(embeddings: np.ndarray, n_components: int = 3):
    """Project ``embeddings`` down to ``n_components`` dimensions with UMAP.

    Args:
        embeddings: Array of embedding vectors handed to ``fit_transform``.
        n_components: Number of output dimensions (3 by default).

    Returns:
        The array produced by ``umap.UMAP.fit_transform``.
    """
    umap_params = {
        "n_neighbors": 15,
        "min_dist": 0.1,
        "n_components": n_components,
        "metric": "cosine",
        "random_state": 42,  # fixed seed passed to UMAP
    }
    projected = umap.UMAP(**umap_params).fit_transform(embeddings)
    console_manager.print_info(
        f"UMAP dimensionality reduction done: {projected.shape}"
    )
    return projected
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def cluster_embeddings(embedding_3d: np.ndarray, min_cluster_size: int = 20):
    """Cluster the reduced embeddings with HDBSCAN.

    Args:
        embedding_3d: Array of points to cluster (one row per document).
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.

    Returns:
        The label array returned by ``fit_predict``. The label ``-1`` is
        treated as "outlier" throughout this module.
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    labels = clusterer.fit_predict(embedding_3d)

    # Do not report the outlier label (-1) as a cluster, so the logged count
    # agrees with the count shown in the 3D plot title.
    n_clusters = len(set(labels) - {-1})
    n_outliers = int(np.sum(labels == -1))
    console_manager.print_info(
        f"HDBSCAN clustering done: {n_clusters} clusters found "
        f"({n_outliers} outlier points)"
    )
    return labels
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def visualize_3d(embedding_3d: np.ndarray, metadata: list, labels: np.ndarray):
    """Render the clustered points as an interactive 3D scatter plot.

    One ``Scatter3d`` trace is added per distinct label; points labelled
    ``-1`` are drawn in black and named "Outliers". The figure is emitted
    with ``st.plotly_chart``.

    Args:
        embedding_3d: Array whose first three columns are used as x, y, z.
        metadata: Per-point dicts; ``title``, ``categories`` and ``year`` are
            read with ``.get`` and shown on hover.
        labels: Cluster label of each point.
    """
    # The hover template is identical for every trace, so build it once.
    hover_template = (
        "Title: %{customdata[0]}<br>"
        "Category: %{customdata[1]}<br>"
        "Year: %{customdata[2]}<br>"
        "Cluster: %{customdata[3]}"
    )
    hover_columns = ["title", "category", "year", "cluster"]

    points = pd.DataFrame(
        {
            "x": embedding_3d[:, 0],
            "y": embedding_3d[:, 1],
            "z": embedding_3d[:, 2],
            "title": [m.get("title", "") for m in metadata],
            "category": [m.get("categories", "") for m in metadata],
            "year": [m.get("year", 0) for m in metadata],
            "cluster": labels,
        }
    )

    # Number of clusters shown in the title, outlier label excluded.
    cluster_count = len(set(labels)) - (1 if -1 in labels else 0)

    distinct_labels = sorted(set(labels))
    palette = px.colors.qualitative.Plotly

    figure = go.Figure()
    for label in distinct_labels:
        subset = points[points["cluster"] == label]
        if label == -1:
            trace_color, trace_name = "black", "Outliers"
        else:
            # Colors repeat once there are more clusters than palette entries.
            trace_color = palette[label % len(palette)]
            trace_name = f"Cluster {label}"

        trace = go.Scatter3d(
            x=subset["x"],
            y=subset["y"],
            z=subset["z"],
            mode="markers",
            marker=dict(size=4, color=trace_color, opacity=0.8),
            name=trace_name,
            customdata=np.stack([subset[col] for col in hover_columns], axis=-1),
            hovertemplate=hover_template,
        )
        figure.add_trace(trace)

    figure.update_layout(
        title=f"Clusters: {cluster_count} ",
        scene=dict(
            xaxis_title="Dimension 1",
            yaxis_title="Dimension 2",
            zaxis_title="Dimension 3",
        ),
        legend=dict(itemsizing="constant"),
    )
    st.plotly_chart(figure, use_container_width=True)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def run_clustering_pipeline(embedding_model=None, vectordb=None):
    """Extract stored embeddings, reduce, cluster and plot them.

    Args:
        embedding_model: Embedding model to use; created with
            ``initialize_embedding_model`` when omitted.
        vectordb: Vector store to read from; created with
            ``initialize_chroma`` when omitted.

    Returns:
        None. A Streamlit warning is shown and the function returns early
        when there is nothing to cluster.
    """
    with console_manager.status("Running clustering pipeline..."):
        if embedding_model is None:
            embedding_model = initialize_embedding_model()
        if vectordb is None:
            vectordb = initialize_chroma(embedding_model)
        if vectordb is None:
            st.warning("No ChromaDB found. Run embeddings generation first.")
            return

        embeddings, metadata = extract_embeddings(vectordb)

        # An empty store yields an empty (or non 2-D) array; stop here with a
        # clear message instead of passing it on to the reduction step.
        if getattr(embeddings, "ndim", 0) != 2 or len(embeddings) == 0:
            st.warning("No embeddings found. Run embeddings generation first.")
            return

        embedding_3d = reduce_dimensionality(embeddings)
        labels = cluster_embeddings(embedding_3d)
        visualize_3d(embedding_3d, metadata, labels)
|
config.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Base paths. DATA_DIR can be overridden through the DATA_DIR environment variable.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.environ.get("DATA_DIR", BASE_DIR / "datasets"))

RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR = (
    DATA_DIR / name for name in ("raw", "processed", "embeddings")
)

# Create the data directories at import time if they are missing
for path in (DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR):
    path.mkdir(parents=True, exist_ok=True)

# Preprocessing settings
MIN_YEAR = 2020
MAX_TEXTS = 2000

# Embedding settings
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Alternative model:
# EMBEDDING_MODEL_NAME = "allenai/specter2"
|
console_manager.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict
|
| 2 |
+
from rich.console import Console
|
| 3 |
+
from rich.markdown import Markdown
|
| 4 |
+
from rich.status import Status
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def dict_to_markdown_table(dictionary: Dict[str, Any]) -> str:
    """Render ``dictionary`` as a two-column Markdown table.

    Args:
        dictionary: Mapping whose keys and values are converted with ``str``.

    Returns:
        The table text: a header row, a separator row and one row per item,
        each terminated by a newline.
    """

    def _cell(value: Any) -> str:
        # A raw "|" or newline inside a cell would split the row, so escape
        # the former and flatten the latter.
        return str(value).replace("|", r"\|").replace("\n", " ")

    rows = ["| Key | Value |", "| --- | ----- |"]
    rows.extend(
        f"| {_cell(key)} | {_cell(value)} |" for key, value in dictionary.items()
    )
    return "\n".join(rows) + "\n"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConsoleManager:
    """Small convenience wrapper around a ``rich`` ``Console``."""

    def __init__(self) -> None:
        self.console = Console()

    def _print_tagged(self, color: str, tag: str, message: str) -> None:
        """Print ``message`` prefixed by a bold, colored ``(tag)``."""
        self.console.print(f"[bold {color}]({tag})[/bold {color}] {message}")

    def print_markdown(self, markdown: str) -> None:
        """Render ``markdown`` text on the console."""
        rendered = Markdown(markdown)
        self.console.print(rendered)

    def print_success(self, message: str) -> None:
        """Print ``message`` with a green ``(success)`` prefix."""
        self._print_tagged("green", "success", message)

    def print_error(self, message: str) -> None:
        """Print ``message`` with a red ``(error)`` prefix."""
        self._print_tagged("red", "error", message)

    def print_info(self, message: str) -> None:
        """Print ``message`` with a blue ``(info)`` prefix."""
        self._print_tagged("blue", "info", message)

    def print_dict(self, dictionary: Dict[str, Any], header: str = "") -> None:
        """Print ``dictionary`` as a Markdown table, preceded by an optional heading."""
        table_text = dict_to_markdown_table(dictionary)
        if header:
            self.print_markdown(f"# {header}\n")
        self.print_markdown(table_text)

    def status(self, message: str) -> Status:
        """Return the console status object for ``message`` (shown in bold green)."""
        return self.console.status(f"[bold green]{message}[/bold green]")


# Module-level instance imported by the rest of the project.
console_manager = ConsoleManager()
|
embedder.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 6 |
+
from langchain_chroma import Chroma
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
from utils.console_manager import console_manager
|
| 11 |
+
from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
|
| 12 |
+
from data_pipeline.preprocess import PROCESSED_PARQUET
|
| 13 |
+
|
| 14 |
+
from data_pipeline.preprocess import preprocess_and_save
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def initialize_embedding_model(
    model_name: str = EMBEDDING_MODEL_NAME,
) -> HuggingFaceEmbeddings:
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier; defaults to ``EMBEDDING_MODEL_NAME``.

    Returns:
        The newly created embeddings object.
    """
    model = HuggingFaceEmbeddings(model_name=model_name)
    console_manager.print_info(f"Initialized embeddings model: {model_name}")
    return model
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def initialize_chroma(
    embedding_model: HuggingFaceEmbeddings, chroma_path: Path = EMBEDDINGS_DIR
) -> Chroma:
    """Open the Chroma store persisted at ``chroma_path``.

    Args:
        embedding_model: Embedding function handed to ``Chroma``.
        chroma_path: Persistence directory (a ``Path`` or a string);
            defaults to ``EMBEDDINGS_DIR``.

    Returns:
        The ``Chroma`` instance bound to that directory.
    """
    chroma_path = Path(chroma_path)

    # is_dir() rather than exists(): iterdir() raises when the path is a file.
    has_existing_data = chroma_path.is_dir() and any(chroma_path.iterdir())
    if has_existing_data:
        console_manager.print_info(f"Loading existing ChromaDB from {chroma_path}")
    else:
        console_manager.print_info(f"Creating new ChromaDB at: {chroma_path}")

    return Chroma(
        persist_directory=str(chroma_path), embedding_function=embedding_model
    )
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_preprocessed_data() -> Optional[pd.DataFrame]:
    """Load the processed parquet file and add a ``content`` column.

    ``content`` concatenates title, abstract and categories into one string
    of the form ``"Title: ... . Abstract: ... . Categories: ..."``.

    Returns:
        The DataFrame, or ``None`` (after logging an error) when
        ``PROCESSED_PARQUET`` does not exist.
    """
    if not PROCESSED_PARQUET.exists():
        console_manager.print_error(f"Processed file not found: {PROCESSED_PARQUET}")
        return None

    def _categories_as_text(value):
        # Lists are joined with ", "; anything else is stringified.
        return ", ".join(value) if isinstance(value, list) else str(value)

    df = pd.read_parquet(PROCESSED_PARQUET)
    title_part = "Title: " + df["title"]
    abstract_part = ". Abstract: " + df["abstract"]
    categories_part = ". Categories: " + df["categories"].apply(_categories_as_text)
    df["content"] = title_part + abstract_part + categories_part
    return df
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def prepare_documents(df: pd.DataFrame) -> list[dict]:
    """Turn each DataFrame row into a document dict ready for storage.

    Args:
        df: Frame with ``content``, ``title``, ``categories`` and ``year``
            columns.

    Returns:
        One dict per row with keys ``id`` (the row index as a string),
        ``content`` and ``metadata`` (``id``, ``title``, ``categories``,
        ``year``).
    """

    def _categories_as_text(categories) -> str:
        # Keep metadata values scalar: a list/array of categories is joined
        # the same way the ``content`` column is built.
        if isinstance(categories, str):
            return categories
        if isinstance(categories, (list, tuple, np.ndarray)):
            return ", ".join(map(str, categories))
        return str(categories)

    # Column-wise iteration avoids building a Series per row (iterrows).
    columns = zip(df.index, df["content"], df["title"], df["categories"], df["year"])
    return [
        {
            "id": str(index),
            "content": content,
            "metadata": {
                "id": str(index),
                "title": title,
                "categories": _categories_as_text(categories),
                "year": int(year),
            },
        }
        for index, content, title, categories, year in columns
    ]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def add_embeddings_to_chroma(vectordb: Chroma, docs: list[dict]):
    """Embed ``docs`` and store them in ``vectordb``.

    Args:
        vectordb: Target vector store.
        docs: Document dicts with ``id``, ``content`` and ``metadata`` keys.
    """
    if not docs:
        console_manager.print_info("No documents to embed; nothing was stored.")
        return

    vectordb.add_texts(
        texts=[d["content"] for d in docs],
        metadatas=[d["metadata"] for d in docs],
        # Pass the prepared ids so that running the pipeline again does not
        # store the same documents a second time under fresh ids.
        ids=[d["id"] for d in docs],
    )
    console_manager.print_success("Embeddings generated and stored successfully!")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def embed_and_store():
    """Embed the preprocessed dataset and store it in the Chroma store.

    Returns:
        The vector store on success; ``None`` when the preprocessed data is
        missing or any step raises (the error is logged).
    """
    try:
        with console_manager.status("Generating embeddings..."):
            # Check for the data first: there is no point in loading the
            # model or opening the store when there is nothing to embed.
            df = load_preprocessed_data()
            if df is None:
                return None

            embedding_model = initialize_embedding_model()
            vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)

            docs = prepare_documents(df)
            add_embeddings_to_chroma(vectordb, docs)
            return vectordb

    except Exception as e:
        # Boundary of the pipeline step: log the failure and signal it with None.
        console_manager.print_error(f"Embedding generation failed: {e}")
        return None
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def extract_embeddings(_vectordb):
    """Read every stored embedding and its metadata from the vector store.

    Args:
        _vectordb: Vector store exposing the underlying collection as
            ``_collection``.

    Returns:
        ``(embeddings, metadata)``: a NumPy array of the stored vectors and
        the list of metadata dicts. When the store returns no embeddings the
        array is empty with shape ``(0, 0)``.
    """
    collection = _vectordb._collection
    # Documents are not needed here, so they are not requested.
    data = collection.get(include=["metadatas", "embeddings"])

    raw_embeddings = data["embeddings"]
    if raw_embeddings is None:
        embeddings = np.empty((0, 0))
    else:
        embeddings = np.array(raw_embeddings)

    metadata = data["metadatas"]
    if metadata is None:
        metadata = []
    return embeddings, metadata
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def run_pipeline(force_run: bool = False):
    """Render the ingestion tab and run preprocessing + embedding on demand.

    Args:
        force_run: When true, embeddings are generated even if the store
            already contains some.

    Returns:
        The vector store when the pipeline ran (or was skipped because
        embeddings already exist); ``None`` when the button was not pressed
        or embedding generation failed.
    """
    st.header("Ingestion & Embedding")

    if not st.button("Run Ingestion & Embeddings Pipeline"):
        return None

    with st.spinner("Running full pipeline..."):
        preprocess_and_save()  # Step 1: Extraction and basic cleaning
        embedding_model = initialize_embedding_model()
        vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)

        # Ask the collection for its size instead of loading every vector
        # just to find out whether any exist.
        embeddings_exist = vectordb._collection.count() > 0

        if embeddings_exist and not force_run:
            st.warning("Embeddings already exist. Skipping embedding generation.")
            return vectordb

        result = embed_and_store()

    # Report the outcome only after the embedding step has actually run.
    if result is None:
        st.error("Embedding generation failed. See the console log for details.")
    else:
        st.success("Pipeline finished successfully!")
    return result
|
latex_to_unicode.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Mapping from LaTeX commands to their Unicode equivalents.
#
# Ordering matters for consumers that apply the replacements one after the
# other in iteration order: a command that extends another one (for example
# \simeq and \sim) must come BEFORE its shorter prefix, otherwise the prefix
# is replaced first and the longer command is never matched.
LATEX_TO_UNICODE = {
    # Lowercase Greek letters
    r"\alpha": "α",
    r"\beta": "β",
    r"\gamma": "γ",
    r"\delta": "δ",
    r"\epsilon": "ε",
    r"\varepsilon": "ε",
    r"\zeta": "ζ",
    r"\eta": "η",
    r"\theta": "θ",
    r"\vartheta": "ϑ",
    r"\iota": "ι",
    r"\kappa": "κ",
    r"\lambda": "λ",
    r"\mu": "μ",
    r"\nu": "ν",
    r"\xi": "ξ",
    r"\pi": "π",
    r"\varpi": "ϖ",
    r"\rho": "ρ",
    r"\varrho": "ϱ",
    r"\sigma": "σ",
    r"\varsigma": "ς",
    r"\tau": "τ",
    r"\upsilon": "υ",
    r"\phi": "φ",
    r"\varphi": "ϕ",
    r"\chi": "χ",
    r"\psi": "ψ",
    r"\omega": "ω",
    # Uppercase Greek letters
    r"\Gamma": "Γ",
    r"\Delta": "Δ",
    r"\Theta": "Θ",
    r"\Lambda": "Λ",
    r"\Xi": "Ξ",
    r"\Pi": "Π",
    r"\Sigma": "Σ",
    r"\Upsilon": "Υ",
    r"\Phi": "Φ",
    r"\Psi": "Ψ",
    r"\Omega": "Ω",
    # Operators and relations
    r"\times": "×",
    r"\div": "÷",
    r"\pm": "±",
    r"\mp": "∓",
    r"\cdots": "⋯",  # must precede \cdot
    r"\cdot": "·",
    r"\ast": "∗",
    r"\star": "★",
    r"\propto": "∝",
    r"\approx": "≈",
    r"\simeq": "≃",  # must precede \sim
    r"\sim": "∼",
    r"\equiv": "≡",
    r"\neq": "≠",
    r"\geq": "≥",
    r"\leq": "≤",
    r"\gg": "≫",
    r"\ll": "≪",
    r"\infty": "∞",
    r"\partial": "∂",
    r"\nabla": "∇",
    r"\sum": "Σ",
    r"\int": "∫",
    # Arrows
    r"\rightarrowtail": "↣",  # must precede \rightarrow
    r"\leftarrowtail": "↢",  # must precede \leftarrow
    r"\rightarrow": "→",
    r"\leftarrow": "←",
    r"\leftrightarrow": "↔",
    r"\Rightarrow": "⇒",
    r"\Leftarrow": "⇐",
    r"\Leftrightarrow": "⇔",
    # Common symbols and constants
    r"\degree": "°",
    r"\circ": "°",
    r"\prime": "′",
    r"\'": "′",
    r"\second": "″",
    r"\ldots": "…",
    r"\bullet": "•",
    r"\perp": "⊥",
    r"\parallel": "∥",
    # Stylized math letters
    r"\mathcal{Q}": "𝒬",
    r"\mathcal{M}": "ℳ",
    r"\mathcal{L}": "ℒ",
    r"\mathcal{H}": "ℋ",
    r"\mathcal{N}": "𝒩",
    r"\mathbb{R}": "ℝ",
    r"\mathbb{C}": "ℂ",
    r"\mathbb{Z}": "ℤ",
    r"\mathbb{N}": "ℕ",
    r"\mathbb{Q}": "ℚ",
    # Physical units and exponents
    r"\^-1": "⁻¹",
    r"\^-2": "⁻²",
    r"\^-3": "⁻³",
    r"\^1": "¹",
    r"\^2": "²",
    r"\^3": "³",
    # Other useful symbols
    r"\dagger": "†",
    r"\ddagger": "‡",
    r"\angle": "∠",
    r"\to": "→",
    r"\~": "~",
}
|
preprocess.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
|
| 4 |
+
from utils.console_manager import console_manager
|
| 5 |
+
|
| 6 |
+
# Paths
|
| 7 |
+
RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
|
| 8 |
+
PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
|
| 9 |
+
PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def load_filtered():
    """Read the raw CSV and keep the rows updated in ``MIN_YEAR`` or later.

    ``update_date`` is parsed with ``errors="coerce"``; the result gets a
    ``year`` column derived from it.

    Returns:
        DataFrame with columns ``id``, ``title``, ``abstract``,
        ``categories`` and ``year`` and a fresh integer index.
    """
    wanted_columns = ["id", "title", "abstract", "categories", "update_date"]
    frame = pd.read_csv(
        RAW_CSV, usecols=wanted_columns, dtype={"id": str}, low_memory=False
    )
    frame["update_date"] = pd.to_datetime(frame["update_date"], errors="coerce")

    is_recent = frame["update_date"].dt.year >= MIN_YEAR
    recent = frame[is_recent].copy()
    recent["year"] = recent["update_date"].dt.year

    output_columns = ["id", "title", "abstract", "categories", "year"]
    return recent[output_columns].reset_index(drop=True)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to one space and trim it.

    Returns an empty string when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        # "\s+" already covers "\n" and "\r", so one substitution is enough.
        return re.sub(r"\s+", " ", text).strip()
    return ""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
    """Apply ``clean_text`` to the given text columns of ``df``.

    Args:
        df: Frame to clean; its columns are overwritten in place.
        text_columns: Column names to clean; ``["title", "abstract"]`` when
            omitted.

    Returns:
        The same ``df`` object.
    """
    columns = ["title", "abstract"] if text_columns is None else text_columns
    for column_name in columns:
        df[column_name] = df[column_name].apply(clean_text)
    return df
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def preprocess_and_save():
    """Build the processed dataset and save it as CSV and parquet.

    When both processed files already exist they are not rebuilt; the
    parquet file is loaded and returned instead.

    Returns:
        The processed DataFrame, or ``None`` if any step raised (the error
        is logged).
    """
    try:
        if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
            console_manager.print_info(
                f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
            )
            df = pd.read_parquet(PROCESSED_PARQUET)
            console_manager.print_info(
                f"Loaded existing processed data ({df.shape[0]} rows)."
            )
            return df

        with console_manager.status("Processing file..."):
            df = load_filtered()

            # Truncate before cleaning: cleaning is row-independent, so the
            # result is the same but only the kept rows are processed.
            if MAX_TEXTS is not None:
                df = df.head(MAX_TEXTS).copy()
                console_manager.print_info(
                    f"Limiting dataset to {len(df)} rows for testing."
                )

            df = clean_dataframe(df)

            df.to_csv(PROCESSED_CSV, index=False)
            df.to_parquet(PROCESSED_PARQUET, index=False)
            console_manager.print_success(
                f"Pré-processing complete. Files saved in: {PROCESSED_DIR}"
            )

    except Exception as e:
        # Boundary of the preprocessing step: log and signal failure with None.
        console_manager.print_error(f"Pré-processing failed: {e}")
        return None
    return df
|
search.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import torch
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from sentence_transformers import SentenceTransformer, util
|
| 5 |
+
from embeddings.embedder import initialize_embedding_model, initialize_chroma
|
| 6 |
+
from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
|
| 7 |
+
from embeddings.latex_to_unicode import LATEX_TO_UNICODE
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def decode_latex(text: str) -> str:
    """Replace known LaTeX commands with Unicode and strip remaining markup.

    Commands listed in ``LATEX_TO_UNICODE`` are substituted in one pass.
    Any other ``\\command`` (with an optional ``{...}`` argument) is removed,
    then leftover braces are dropped and the result is trimmed.

    Args:
        text: Text that may contain LaTeX commands.

    Returns:
        The decoded text.
    """
    if LATEX_TO_UNICODE:
        # Longest commands first so that e.g. \simeq wins over its prefix
        # \sim, independently of the mapping's own ordering.
        commands = sorted(LATEX_TO_UNICODE, key=len, reverse=True)
        # A command ending in a letter must not be followed by another
        # letter, otherwise \to would also fire inside \top.
        known = "|".join(
            re.escape(command)
            + (r"(?![a-zA-Z])" if command[-1].isascii() and command[-1].isalpha() else "")
            for command in commands
        )
        text = re.sub(known, lambda match: LATEX_TO_UNICODE[match.group(0)], text)

    text = re.sub(r"\\[a-zA-Z]+(\{.*?\})?", "", text)
    return text.replace("{", "").replace("}", "").strip()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Model used to score sentence-level similarity within a single document
|
| 18 |
+
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def best_sentence_by_embedding(content: str, query: str):
    """Return the sentence of ``content`` most similar to ``query``.

    ``content`` is split after ``.``, ``!`` or ``?`` followed by whitespace;
    each sentence and the query are encoded with ``sentence_model`` and
    compared by cosine similarity.

    Args:
        content: Text to split into sentences.
        query: Search query.

    Returns:
        ``(sentence, score)`` for the best match, or ``("", 0.0)`` when
        ``content`` contains no sentence.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", content) if s.strip()]
    if not sentences:
        # Nothing to compare against: argmax over an empty tensor would raise.
        return "", 0.0

    # The query is encoded together with the sentences and sits last.
    embeddings = sentence_model.encode(sentences + [query], convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings[-1], embeddings[:-1])[0]
    best_idx = int(torch.argmax(cosine_scores))
    return sentences[best_idx], cosine_scores[best_idx].item()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def semantic_search(vectordb, query, k=5):
    """Return up to ``k`` ``(document, score)`` pairs matching ``query``.

    If the scored search raises, the plain similarity search is used
    instead and every score is ``None``.
    """
    try:
        return vectordb.similarity_search_with_score(query, k=k)
    except Exception:
        # Best-effort fallback: same search without scores.
        unscored = vectordb.similarity_search(query, k=k)
        return [(document, None) for document in unscored]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def normalize_score(distance):
    """Map a distance to ``1 / (1 + distance)``; ``None`` maps to ``0.0``."""
    return 0.0 if distance is None else 1 / (1 + distance)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_user_input():
    """Render the query box and result-count slider.

    Returns:
        ``(query, k)``: the entered text and the chosen number of results
        (slider from 1 to 10, starting at 5).
    """
    user_query = st.text_input("Enter search query:")
    result_count = st.slider("Number of results", min_value=1, max_value=10, value=5)
    return user_query, result_count
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def truncate_sentence(text: str, max_len: int = 1000) -> str:
    """Cut ``text`` to ``max_len`` characters, appending "..." if it was longer."""
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text[:max_len]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def process_results(results_with_scores, query):
    """Re-rank search hits by combining document and best-sentence relevance.

    Hits whose metadata ``id`` was already seen are skipped. For each
    remaining hit the stored text is LaTeX-decoded, split into title and
    abstract, and the abstract sentence closest to ``query`` is located.

    Args:
        results_with_scores: Iterable of ``(document, score)`` pairs; the
            score may be ``None``.
        query: The search query.

    Returns:
        List of result dicts sorted by ``final_score`` in descending order.
    """
    ranked_results = []
    seen_ids = set()

    for doc, doc_score in results_with_scores:
        metadata = doc.metadata or {}
        doc_id = metadata.get("id", "N/A")
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)

        categories = metadata.get("categories", "N/A")
        year = metadata.get("year", "N/A")

        raw_content = decode_latex(doc.page_content)
        head, marker, body = raw_content.partition("Abstract:")
        if marker:
            # Take everything before the abstract marker as the title, so a
            # title containing ". " is not cut at its first sentence.
            title = head.strip()
            if title.startswith("Title:"):
                title = title[len("Title:"):].strip()
            if title.endswith("."):
                title = title[:-1].rstrip()  # drop the separator period
            content = body.strip()
        else:
            # No abstract marker: keep the whole text rather than failing.
            title = raw_content.split(". ", 1)[0].replace("Title: ", "").strip()
            content = raw_content

        best_sentence, local_relevance = best_sentence_by_embedding(content, query)

        # A missing score (unscored fallback search) contributes no
        # document-level relevance.
        doc_relevance = 0.0 if doc_score is None else 1 - doc_score
        final_score = 0.6 * local_relevance + 0.4 * doc_relevance

        ranked_results.append(
            {
                "doc": doc,
                "doc_id": doc_id,
                "categories": categories,
                "year": year,
                "title": title,
                "content": content,
                "best_sentence": best_sentence,
                "local_relevance": local_relevance,
                "doc_score": doc_score,
                "final_score": final_score,
            }
        )

    return sorted(ranked_results, key=lambda x: x["final_score"], reverse=True)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def display_results(ranked_results):
    """Render the ranked search results in the Streamlit page.

    Args:
        ranked_results: Result dicts with ``doc_id``, ``categories``,
            ``year``, ``title``, ``content``, ``best_sentence``,
            ``local_relevance`` and ``doc_score`` keys.
    """
    st.success(f"Top {len(ranked_results)} results found:")
    for i, r in enumerate(ranked_results, 1):
        # Truncate first, then highlight, so the cut can never land inside
        # the bold markers and leave them unbalanced.
        excerpt = truncate_sentence(r["content"])
        best_sentence = r["best_sentence"]
        if best_sentence:  # replacing "" would insert stray markers
            excerpt = excerpt.replace(best_sentence, f"**{best_sentence}**", 1)

        # A missing score means "no document relevance", not a perfect match.
        doc_score = r["doc_score"]
        doc_relevance = 0.0 if doc_score is None else 1 - doc_score

        st.markdown(f"**RESULT {i}:**")
        st.markdown(
            f"Document ID: {r['doc_id']} | Categories: {r['categories']} | Year: {r['year']} | "
            f"Doc Relevance: {doc_relevance:.2f} | "
            f"Best Sentence Relevance: {r['local_relevance']:.2f}"
        )
        st.markdown(f"Title: {r['title']}")
        st.markdown(f"Most Relevant Excerpt: {excerpt}")
        st.markdown("---")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def run_search(embedding_model=None, vectordb=None):
    """Render the semantic-search tab and show results for the entered query.

    Args:
        embedding_model: Embedding model; created with
            ``initialize_embedding_model`` when omitted.
        vectordb: Vector store; opened with ``initialize_chroma`` when
            omitted.
    """
    st.header("🔎 Semantic Search")
    st.subheader("Search for semantically similar documents")

    if embedding_model is None:
        embedding_model = initialize_embedding_model()
    if vectordb is None:
        vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
    if not vectordb:
        st.warning("No ChromaDB found. Run embeddings generation first.")
        return

    query, k = get_user_input()
    if not query or not query.strip():
        st.info("Type a query above to start searching.")
        return

    # Fetch twice as many hits as requested: duplicates are dropped during
    # processing and the list is trimmed back to k afterwards.
    results_with_scores = semantic_search(vectordb, query, k=k * 2)

    if not results_with_scores:
        st.warning("No results found.")
        return

    ranked_results = process_results(results_with_scores, query)[:k]
    display_results(ranked_results)
|