import gradio as gr
import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer, util
import gdown
import os

# --------- Google Drive links ----------
DRIVE_LINKS = {
    "books": "https://drive.google.com/uc?export=download&id=1FElHiASfiVLeuHWYaqd2Q5foxWRlJT-O",
    "theses": "https://drive.google.com/uc?export=download&id=1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv"
}

BOOKS_FILE = "book.xlsx"
THESES_FILE = "theses.xlsx"

# --------- Download the files if they are not already present ----------
def download_from_drive(link, output):
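    # Skip the download when the target file already exists on disk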
    if not os.path.exists(output):
        gdown.download(link, output, quiet=False)

download_from_drive(DRIVE_LINKS["books"], BOOKS_FILE)
download_from_drive(DRIVE_LINKS["theses"], THESES_FILE)

# --------- Load the data ----------
def load_data(file):
    df = pd.read_excel(file).fillna("غير متوافر")
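    # Guarantee a "Title" column: fall back to the Arabic "العنوان" column, else to the first column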
    if "Title" not in df.columns and "العنوان" in df.columns:
        df["Title"] = df["العنوان"].astype(str)
    elif "Title" not in df.columns:
        df["Title"] = df.iloc[:,0].astype(str)
    return df

books_df = load_data(BOOKS_FILE)
theses_df = load_data(THESES_FILE)

# --------- Semantic model ----------
MODEL_NAME = "all-MiniLM-L6-v2"
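# Note: all-MiniLM-L6-v2 is primarily English-trained; a multilingual model
# (e.g. "paraphrase-multilingual-MiniLM-L12-v2") may give better results for Arabic titles.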
model = SentenceTransformer(MODEL_NAME)

# --------- Build the embeddings once (cached to disk) ----------
def build_or_load_embeddings(df, name):
    path = f"{name}_embeddings.pkl"
    if os.path.exists(path):
        with open(path, "rb") as f:
            emb = pickle.load(f)
        if len(emb) == len(df):
            return emb
    texts = df["Title"].astype(str).tolist()
    emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    with open(path, "wb") as f:
        pickle.dump(emb, f)
    return emb

books_embeddings = build_or_load_embeddings(books_df, "books")
theses_embeddings = build_or_load_embeddings(theses_df, "theses")

# --------- Search function ----------
def search(query, category, mode):
    if not query.strip():
        return "⚠️ اكتب كلمة أو جملة للبحث"

    df = books_df if category=="Books" else theses_df
    emb = books_embeddings if category=="Books" else theses_embeddings

    if mode == "نصي":
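        # "نصي" (text) mode: case-insensitive substring match on the Title column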
        results = df[df["Title"].str.contains(query, case=False, na=False)]
    else:
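        # "دلالي" (semantic) mode: rank all rows by cosine similarity between
        # the query embedding and the cached title embeddings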
        q_emb = model.encode([query], convert_to_numpy=True)
        scores = util.cos_sim(q_emb, emb)[0].cpu().numpy()
        idx = np.argsort(-scores)
        results = df.iloc[idx]

    if results.empty:
        return "❌ لم يتم العثور على نتائج"

    html = "<table border=1 style='border-collapse:collapse;width:100%;'>"
    html += "<tr>" + "".join([f"<th>{col}</th>" for col in results.columns]) + "</tr>"
    for _, row in results.iterrows():
        html += "<tr>" + "".join([f"<td>{val}</td>" for val in row.values]) + "</tr>"
    html += "</table>"
    return html

# --------- Gradio interface ----------
iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(label="اكتب كلمة البحث"),
        gr.Dropdown(["Books","Theses"], label="الفئة"),
        gr.Radio(["نصي","دلالي"], label="نوع البحث")
    ],
    outputs="html",
    title="البحث في المكتبة الرقمية"
)

iface.launch()