beatrizpm committed
Commit 532f1f0 · verified · 1 Parent(s): f26ff01

Upload 9 files

Files changed (10)
  1. .gitattributes +1 -0
  2. app_dashboard.py +55 -0
  3. arxiv_astro_ph.csv +3 -0
  4. cluster.py +126 -0
  5. config.py +23 -0
  6. console_manager.py +41 -0
  7. embedder.py +135 -0
  8. latex_to_unicode.py +112 -0
  9. preprocess.py +70 -0
  10. search.py +134 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ arxiv_astro_ph.csv filter=lfs diff=lfs merge=lfs -text
app_dashboard.py ADDED
@@ -0,0 +1,55 @@
+ import streamlit as st
+ from config import EMBEDDINGS_DIR
+
+ from embeddings.search import run_search
+ from embeddings.cluster import run_clustering_pipeline
+ from embeddings.embedder import (
+     initialize_embedding_model,
+     initialize_chroma,
+     run_pipeline,
+ )
+
+ # BASIC STREAMLIT SETUP
+ st.set_page_config(
+     page_title="Semantic Clusters Dashboard",
+     page_icon="🪐",
+     layout="wide",
+ )
+
+ st.title("Semantic Clusters Dashboard")
+ st.markdown("Visualize document clusters with interactive semantic search.")
+
+
+ @st.cache_resource
+ def get_embeddings_model():
+     return initialize_embedding_model()
+
+
+ @st.cache_resource
+ def get_vectordb():
+     embeddings_model = get_embeddings_model()
+     return initialize_chroma(embeddings_model, EMBEDDINGS_DIR)
+
+
+ embedding_model = get_embeddings_model()
+ vectordb = get_vectordb()
+
+ # MAIN INTERFACE
+ (
+     tab_ingestion,
+     tab_clusters,
+     tab_search,
+ ) = st.tabs(["Ingestion & Embedding", "3D Clusters", "Semantic Search"])
+
+ with tab_ingestion:
+     run_pipeline(force_run=False)
+
+ with tab_search:
+     run_search(embedding_model=embedding_model, vectordb=vectordb)
+
+ with tab_clusters:
+     st.header("3D Clusters View")
+     if st.button("🌀 Generate clusters"):
+         with st.spinner("Generating clusters..."):
+             run_clustering_pipeline(embedding_model=embedding_model, vectordb=vectordb)
+         st.success("Clusters generated!")
arxiv_astro_ph.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01679ab07b5d2149eaf37239fef68e0a8dc9b7cc715295c8f559419bdba00b21
+ size 553004170
cluster.py ADDED
@@ -0,0 +1,126 @@
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ import umap
+ import hdbscan
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+ from utils.console_manager import console_manager
+
+ from embeddings.embedder import (
+     initialize_chroma,
+     initialize_embedding_model,
+     extract_embeddings,
+ )
+
+
+ def reduce_dimensionality(embeddings: np.ndarray, n_components: int = 3):
+     reducer = umap.UMAP(
+         n_neighbors=15,
+         min_dist=0.1,
+         n_components=n_components,
+         metric="cosine",
+         random_state=42,
+     )
+     embedding_3d = reducer.fit_transform(embeddings)
+     console_manager.print_info(
+         f"UMAP dimensionality reduction done: {embedding_3d.shape}"
+     )
+     return embedding_3d
+
+
+ def cluster_embeddings(embedding_3d: np.ndarray, min_cluster_size: int = 20):
+     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
+     labels = clusterer.fit_predict(embedding_3d)
+     console_manager.print_info(
+         f"HDBSCAN clustering done: {len(set(labels))} clusters found"
+     )
+     return labels
+
+
+ def visualize_3d(embedding_3d: np.ndarray, metadata: list, labels: np.ndarray):
+     df_vis = pd.DataFrame(
+         {
+             "x": embedding_3d[:, 0],
+             "y": embedding_3d[:, 1],
+             "z": embedding_3d[:, 2],
+             "title": [m.get("title", "") for m in metadata],
+             "category": [m.get("categories", "") for m in metadata],
+             "year": [m.get("year", 0) for m in metadata],
+             "cluster": labels,
+         }
+     )
+
+     # Count number of clusters (excluding outliers)
+     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+
+     # Define color map (outliers = black)
+     unique_labels = sorted(set(labels))
+     palette = px.colors.qualitative.Plotly
+     color_map = {
+         label: ("black" if label == -1 else palette[label % len(palette)])
+         for label in unique_labels
+     }
+
+     fig = go.Figure()
+
+     for label in unique_labels:
+         cluster_points = df_vis[df_vis["cluster"] == label]
+         color = color_map[label]
+         name = f"Cluster {label}" if label != -1 else "Outliers"
+
+         hover_text = (
+             "Title: %{customdata[0]}<br>"
+             "Category: %{customdata[1]}<br>"
+             "Year: %{customdata[2]}<br>"
+             "Cluster: %{customdata[3]}"
+         )
+
+         fig.add_trace(
+             go.Scatter3d(
+                 x=cluster_points["x"],
+                 y=cluster_points["y"],
+                 z=cluster_points["z"],
+                 mode="markers",
+                 marker=dict(size=4, color=color, opacity=0.8),
+                 name=name,
+                 customdata=np.stack(
+                     [
+                         cluster_points["title"],
+                         cluster_points["category"],
+                         cluster_points["year"],
+                         cluster_points["cluster"],
+                     ],
+                     axis=-1,
+                 ),
+                 hovertemplate=hover_text,
+             )
+         )
+
+     fig.update_layout(
+         title=f"Clusters: {n_clusters}",
+         scene=dict(
+             xaxis_title="Dimension 1",
+             yaxis_title="Dimension 2",
+             zaxis_title="Dimension 3",
+         ),
+         legend=dict(itemsizing="constant"),
+     )
+     st.plotly_chart(fig, use_container_width=True)
+
+
+ def run_clustering_pipeline(embedding_model=None, vectordb=None):
+     with console_manager.status("Running clustering pipeline..."):
+         if embedding_model is None:
+             embedding_model = initialize_embedding_model()
+         if vectordb is None:
+             vectordb = initialize_chroma(embedding_model)
+         if vectordb is None:
+             st.warning("No ChromaDB found. Run embeddings generation first.")
+             return
+
+         embeddings, metadata = extract_embeddings(vectordb)
+         embedding_3d = reduce_dimensionality(embeddings)
+         labels = cluster_embeddings(embedding_3d)
+         visualize_3d(embedding_3d, metadata, labels)
config.py ADDED
@@ -0,0 +1,23 @@
+ from pathlib import Path
+ import os
+
+ # Base paths
+ BASE_DIR = Path(__file__).resolve().parent.parent
+ DATA_DIR = Path(os.environ.get("DATA_DIR", BASE_DIR / "datasets"))
+
+ RAW_DIR = DATA_DIR / "raw"
+ PROCESSED_DIR = DATA_DIR / "processed"
+ EMBEDDINGS_DIR = DATA_DIR / "embeddings"
+
+ # Ensure directories exist
+ for path in [DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR]:
+     path.mkdir(parents=True, exist_ok=True)
+
+ # Preprocessing configs
+ MIN_YEAR = 2020
+ MAX_TEXTS = 2000
+
+ # Embeddings configuration
+ EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+
+ # EMBEDDING_MODEL_NAME = "allenai/specter2"
console_manager.py ADDED
@@ -0,0 +1,41 @@
+ from typing import Any, Dict
+ from rich.console import Console
+ from rich.markdown import Markdown
+ from rich.status import Status
+
+
+ def dict_to_markdown_table(dictionary: Dict[str, Any]) -> str:
+     table = "| Key | Value |\n| --- | ----- |\n"
+     for key, value in dictionary.items():
+         table += f"| {key} | {value} |\n"
+     return table
+
+
+ class ConsoleManager:
+     def __init__(self) -> None:
+         self.console = Console()
+
+     def print_markdown(self, markdown: str) -> None:
+         self.console.print(Markdown(markdown))
+
+     def print_success(self, message: str) -> None:
+         self.console.print(f"[bold green](success)[/bold green] {message}")
+
+     def print_error(self, message: str) -> None:
+         self.console.print(f"[bold red](error)[/bold red] {message}")
+
+     def print_info(self, message: str) -> None:
+         self.console.print(f"[bold blue](info)[/bold blue] {message}")
+
+     def print_dict(self, dictionary: Dict[str, Any], header: str = "") -> None:
+         markdown_table = dict_to_markdown_table(dictionary)
+         if header:
+             self.print_markdown(f"# {header}\n")
+
+         self.console.print(Markdown(markdown_table))
+
+     def status(self, message: str) -> Status:
+         return self.console.status(f"[bold green]{message}[/bold green]")
+
+
+ console_manager = ConsoleManager()
embedder.py ADDED
@@ -0,0 +1,135 @@
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from pathlib import Path
+ from typing import Optional
+
+ from utils.console_manager import console_manager
+ from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
+ from data_pipeline.preprocess import PROCESSED_PARQUET, preprocess_and_save
+
+
+ def initialize_embedding_model(
+     model_name: str = EMBEDDING_MODEL_NAME,
+ ) -> HuggingFaceEmbeddings:
+     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+     console_manager.print_info(f"Initialized embeddings model: {model_name}")
+     return embeddings
+
+
+ def initialize_chroma(
+     embedding_model: HuggingFaceEmbeddings, chroma_path: Path = EMBEDDINGS_DIR
+ ) -> Chroma:
+     if chroma_path.exists() and any(chroma_path.iterdir()):
+         console_manager.print_info(f"Loading existing ChromaDB from {chroma_path}")
+     else:
+         console_manager.print_info(f"Creating new ChromaDB at: {chroma_path}")
+
+     vectordb = Chroma(
+         persist_directory=str(chroma_path), embedding_function=embedding_model
+     )
+     return vectordb
+
+
+ def load_preprocessed_data() -> Optional[pd.DataFrame]:
+     if not PROCESSED_PARQUET.exists():
+         console_manager.print_error(f"Processed file not found: {PROCESSED_PARQUET}")
+         return None
+
+     df = pd.read_parquet(PROCESSED_PARQUET)
+     df["content"] = (
+         "Title: "
+         + df["title"]
+         + ". Abstract: "
+         + df["abstract"]
+         + ". Categories: "
+         + df["categories"].apply(
+             lambda x: ", ".join(x) if isinstance(x, list) else str(x)
+         )
+     )
+     return df
+
+
+ def prepare_documents(df: pd.DataFrame) -> list[dict]:
+     docs = [
+         {
+             "id": str(i),
+             "content": row["content"],
+             "metadata": {
+                 "id": str(i),
+                 "title": row["title"],
+                 "categories": row["categories"],
+                 "year": int(row["year"]),
+             },
+         }
+         for i, row in df.iterrows()
+     ]
+     return docs
+
+
+ def add_embeddings_to_chroma(vectordb: Chroma, docs: list[dict]):
+     vectordb.add_texts(
+         texts=[d["content"] for d in docs],
+         metadatas=[d["metadata"] for d in docs],
+     )
+     console_manager.print_success("Embeddings generated and stored successfully!")
+
+
+ def embed_and_store():
+     try:
+         with console_manager.status("Generating embeddings..."):
+             embedding_model = initialize_embedding_model()
+             vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+
+             df = load_preprocessed_data()
+             if df is None:
+                 return None
+
+             docs = prepare_documents(df)
+             add_embeddings_to_chroma(vectordb, docs)
+             return vectordb
+
+     except Exception as e:
+         console_manager.print_error(f"Embedding generation failed: {e}")
+         return None
+
+
+ def extract_embeddings(_vectordb):
+     collection = _vectordb._collection
+     data = collection.get(include=["metadatas", "documents", "embeddings"])
+     embeddings = np.array(data["embeddings"])
+     metadata = data["metadatas"]
+     return embeddings, metadata
+
+
+ def run_pipeline(force_run: bool = False):
+     st.header("Ingestion & Embedding")
+
+     if st.button("Run Ingestion & Embeddings Pipeline"):
+         with st.spinner("Running full pipeline..."):
+             preprocess_and_save()  # Step 1: extraction and basic cleaning
+             embedding_model = initialize_embedding_model()
+             vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+             collection_data = vectordb._collection.get(include=["embeddings"])
+             embeddings = collection_data["embeddings"]
+
+             if embeddings is not None:
+                 if isinstance(embeddings, np.ndarray):
+                     embeddings_exist = embeddings.size > 0
+                 else:
+                     embeddings_exist = len(embeddings) > 0
+             else:
+                 embeddings_exist = False
+
+             if embeddings_exist and not force_run:
+                 st.warning("Embeddings already exist. Skipping embedding generation.")
+                 return vectordb
+
+             vectordb = embed_and_store()
+
+         st.success("Pipeline finished successfully!")
+         return vectordb
latex_to_unicode.py ADDED
@@ -0,0 +1,112 @@
+ LATEX_TO_UNICODE = {
+     # Lowercase Greek letters
+     r"\alpha": "α",
+     r"\beta": "β",
+     r"\gamma": "γ",
+     r"\delta": "δ",
+     r"\epsilon": "ε",
+     r"\varepsilon": "ε",
+     r"\zeta": "ζ",
+     r"\eta": "η",
+     r"\theta": "θ",
+     r"\vartheta": "ϑ",
+     r"\iota": "ι",
+     r"\kappa": "κ",
+     r"\lambda": "λ",
+     r"\mu": "μ",
+     r"\nu": "ν",
+     r"\xi": "ξ",
+     r"\pi": "π",
+     r"\varpi": "ϖ",
+     r"\rho": "ρ",
+     r"\varrho": "ϱ",
+     r"\sigma": "σ",
+     r"\varsigma": "ς",
+     r"\tau": "τ",
+     r"\upsilon": "υ",
+     r"\phi": "φ",
+     r"\varphi": "ϕ",
+     r"\chi": "χ",
+     r"\psi": "ψ",
+     r"\omega": "ω",
+     # Uppercase Greek letters
+     r"\Gamma": "Γ",
+     r"\Delta": "Δ",
+     r"\Theta": "Θ",
+     r"\Lambda": "Λ",
+     r"\Xi": "Ξ",
+     r"\Pi": "Π",
+     r"\Sigma": "Σ",
+     r"\Upsilon": "Υ",
+     r"\Phi": "Φ",
+     r"\Psi": "Ψ",
+     r"\Omega": "Ω",
+     # Operators and relations
+     r"\times": "×",
+     r"\div": "÷",
+     r"\pm": "±",
+     r"\mp": "∓",
+     r"\cdot": "·",
+     r"\ast": "∗",
+     r"\star": "★",
+     r"\propto": "∝",
+     r"\approx": "≈",
+     r"\sim": "∼",
+     r"\simeq": "≃",
+     r"\equiv": "≡",
+     r"\neq": "≠",
+     r"\geq": "≥",
+     r"\leq": "≤",
+     r"\gg": "≫",
+     r"\ll": "≪",
+     r"\infty": "∞",
+     r"\partial": "∂",
+     r"\nabla": "∇",
+     r"\sum": "Σ",
+     r"\int": "∫",
+     # Arrows
+     r"\rightarrow": "→",
+     r"\leftarrow": "←",
+     r"\leftrightarrow": "↔",
+     r"\Rightarrow": "⇒",
+     r"\Leftarrow": "⇐",
+     r"\Leftrightarrow": "⇔",
+     # Common symbols and constants
+     r"\degree": "°",
+     r"\circ": "°",
+     r"\prime": "′",
+     r"\'": "′",
+     r"\second": "″",
+     r"\ldots": "…",
+     r"\cdots": "⋯",
+     r"\bullet": "•",
+     r"\perp": "⊥",
+     r"\parallel": "∥",
+     r"\rightarrowtail": "↣",
+     r"\leftarrowtail": "↢",
+     # Stylized math letters
+     r"\mathcal{Q}": "𝒬",
+     r"\mathcal{M}": "ℳ",
+     r"\mathcal{L}": "ℒ",
+     r"\mathcal{H}": "ℋ",
+     r"\mathcal{N}": "𝒩",
+     r"\mathbb{R}": "ℝ",
+     r"\mathbb{C}": "ℂ",
+     r"\mathbb{Z}": "ℤ",
+     r"\mathbb{N}": "ℕ",
+     r"\mathbb{Q}": "ℚ",
+     # Physical units and exponents
+     r"\^-1": "⁻¹",
+     r"\^-2": "⁻²",
+     r"\^-3": "⁻³",
+     r"\^1": "¹",
+     r"\^2": "²",
+     r"\^3": "³",
+     # Other useful symbols
+     r"\dagger": "†",
+     r"\ddagger": "‡",
+     r"\angle": "∠",
+     r"\to": "→",
+     r"\~": "~",
+ }
preprocess.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import re
+ from config import PROCESSED_DIR, RAW_DIR, MIN_YEAR, MAX_TEXTS
+ from utils.console_manager import console_manager
+
+ # Paths
+ RAW_CSV = RAW_DIR / "arxiv_astro_ph.csv"
+ PROCESSED_CSV = PROCESSED_DIR / "arxiv_astro_ph_2020plus.csv"
+ PROCESSED_PARQUET = PROCESSED_DIR / "arxiv_astro_ph_2020plus.parquet"
+
+
+ def load_filtered():
+     usecols = ["id", "title", "abstract", "categories", "update_date"]
+     df = pd.read_csv(RAW_CSV, usecols=usecols, dtype={"id": str}, low_memory=False)
+     df["update_date"] = pd.to_datetime(df["update_date"], errors="coerce")
+     df = df[df["update_date"].dt.year >= MIN_YEAR].copy()
+     df["year"] = df["update_date"].dt.year
+     df = df[["id", "title", "abstract", "categories", "year"]].reset_index(drop=True)
+     return df
+
+
+ def clean_text(text: str) -> str:
+     if not isinstance(text, str):
+         return ""
+     text = text.replace("\n", " ").replace("\r", " ")
+     text = re.sub(r"\s+", " ", text)
+     return text.strip()
+
+
+ def clean_dataframe(df: pd.DataFrame, text_columns=None) -> pd.DataFrame:
+     if text_columns is None:
+         text_columns = ["title", "abstract"]
+     for col in text_columns:
+         df[col] = df[col].apply(clean_text)
+     return df
+
+
+ def preprocess_and_save():
+     try:
+         if PROCESSED_PARQUET.exists() and PROCESSED_CSV.exists():
+             console_manager.print_info(
+                 f"Processed files already exist at {PROCESSED_DIR}. Skipping preprocessing."
+             )
+             df = pd.read_parquet(PROCESSED_PARQUET)
+             console_manager.print_info(
+                 f"Loaded existing processed data ({df.shape[0]} rows)."
+             )
+             return df
+
+         with console_manager.status("Processing file..."):
+             df = load_filtered()
+             df = clean_dataframe(df)
+
+             if MAX_TEXTS is not None:
+                 df = df.head(MAX_TEXTS)
+                 console_manager.print_info(
+                     f"Limiting dataset to {len(df)} rows for testing."
+                 )
+
+             df.to_csv(PROCESSED_CSV, index=False)
+             df.to_parquet(PROCESSED_PARQUET, index=False)
+             console_manager.print_success(
+                 f"Pre-processing complete. Files saved in: {PROCESSED_DIR}"
+             )
+
+     except Exception as e:
+         console_manager.print_error(f"Pre-processing failed: {e}")
+         return None
+     return df
search.py ADDED
@@ -0,0 +1,134 @@
+ import re
+ import torch
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer, util
+ from embeddings.embedder import initialize_embedding_model, initialize_chroma
+ from config import EMBEDDINGS_DIR, EMBEDDING_MODEL_NAME
+ from embeddings.latex_to_unicode import LATEX_TO_UNICODE
+
+
+ def decode_latex(text: str) -> str:
+     for latex, uni in LATEX_TO_UNICODE.items():
+         text = text.replace(latex, uni)
+     text = re.sub(r"\\[a-zA-Z]+(\{.*?\})?", "", text)
+     return text.replace("{", "").replace("}", "").strip()
+
+
+ # Used to compute sentence-level similarity within a single document
+ sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+
+
+ def best_sentence_by_embedding(content: str, query: str):
+     sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", content) if s.strip()]
+     embeddings = sentence_model.encode(sentences + [query], convert_to_tensor=True)
+     cosine_scores = util.cos_sim(embeddings[-1], embeddings[:-1])[0]
+     best_idx = int(torch.argmax(cosine_scores))
+     return sentences[best_idx], cosine_scores[best_idx].item()
+
+
+ def semantic_search(vectordb, query, k=5):
+     try:
+         results_with_scores = vectordb.similarity_search_with_score(query, k=k)
+     except Exception:
+         raw_results = vectordb.similarity_search(query, k=k)
+         results_with_scores = [(r, None) for r in raw_results]
+     return results_with_scores
+
+
+ def normalize_score(distance):
+     if distance is None:
+         return 0.0
+     return 1 / (1 + distance)
+
+
+ def get_user_input():
+     query = st.text_input("Enter search query:")
+     k = st.slider("Number of results", min_value=1, max_value=10, value=5)
+     return query, k
+
+
+ def truncate_sentence(text: str, max_len: int = 1000) -> str:
+     return text[:max_len] + ("..." if len(text) > max_len else "")
+
+
+ def process_results(results_with_scores, query):
+     ranked_results = []
+     seen_ids = set()
+
+     for doc, doc_score in results_with_scores:
+         metadata = doc.metadata or {}
+         doc_id = metadata.get("id", "N/A")
+         if doc_id in seen_ids:
+             continue
+         seen_ids.add(doc_id)
+
+         categories = metadata.get("categories", "N/A")
+         year = metadata.get("year", "N/A")
+
+         raw_content = decode_latex(doc.page_content)
+         title = raw_content.split(". ", 1)[0].replace("Title: ", "").strip()
+         content = raw_content.split("Abstract:", 1)[1].strip()
+
+         best_sentence, local_relevance = best_sentence_by_embedding(content, query)
+         # doc_score may be None when the score-less fallback search was used
+         final_score = 0.6 * local_relevance + 0.4 * (1 - (doc_score or 0))
+
+         ranked_results.append(
+             {
+                 "doc": doc,
+                 "doc_id": doc_id,
+                 "categories": categories,
+                 "year": year,
+                 "title": title,
+                 "content": content,
+                 "best_sentence": best_sentence,
+                 "local_relevance": local_relevance,
+                 "doc_score": doc_score,
+                 "final_score": final_score,
+             }
+         )
+
+     return sorted(ranked_results, key=lambda x: x["final_score"], reverse=True)
+
+
+ def display_results(ranked_results):
+     st.success(f"Top {len(ranked_results)} results found:")
+     for i, r in enumerate(ranked_results, 1):
+         content = r["content"]
+         highlighted_content = content.replace(
+             r["best_sentence"], f"**{r['best_sentence']}**", 1
+         )
+         st.markdown(f"**RESULT {i}:**")
+         st.markdown(
+             f"Document ID: {r['doc_id']} | Categories: {r['categories']} | Year: {r['year']} | "
+             f"Doc Relevance: {1 - (r['doc_score'] if r['doc_score'] else 0):.2f} | "
+             f"Best Sentence Relevance: {r['local_relevance']:.2f}"
+         )
+         st.markdown(f"Title: {r['title']}")
+         st.markdown(f"Most Relevant Excerpt: {truncate_sentence(highlighted_content)}")
+         st.markdown("---")
+
+
+ def run_search(embedding_model=None, vectordb=None):
+     st.header("🔎 Semantic Search")
+     st.subheader("Search for semantically similar documents")
+
+     if embedding_model is None:
+         embedding_model = initialize_embedding_model()
+     if vectordb is None:
+         vectordb = initialize_chroma(embedding_model, EMBEDDINGS_DIR)
+     if not vectordb:
+         st.warning("No ChromaDB found. Run embeddings generation first.")
+         return
+
+     query, k = get_user_input()
+     if not query:
+         st.info("Type a query above to start searching.")
+         return
+
+     results_with_scores = semantic_search(vectordb, query, k=k * 2)
+
+     if not results_with_scores:
+         st.warning("No results found.")
+         return
+
+     ranked_results = process_results(results_with_scores, query)
+     display_results(ranked_results)