import os
import pandas as pd
import numpy as np
import requests
import gradio as gr
import gdown
import pickle
BOOKS_FILE = "book.xlsx"
THESES_FILE = "theses.xlsx"
# Google Drive file IDs for the two spreadsheets
DRIVE_LINKS = {
    "books": "1FElHiASfiVLeuHWYaqd2Q5foxWRlJT-O",
    "theses": "1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv"
}
def download_from_drive(file_id, output):
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    gdown.download(url, output, quiet=True)
# Download the data files if they are not already present
if not os.path.exists(BOOKS_FILE):
    download_from_drive(DRIVE_LINKS["books"], BOOKS_FILE)
if not os.path.exists(THESES_FILE):
    download_from_drive(DRIVE_LINKS["theses"], THESES_FILE)
# Read the data and make sure a "Title" column exists
def load_data(file):
    df = pd.read_excel(file).fillna("غير متوافر")  # fill missing cells with "not available"
    if "Title" not in df.columns and "العنوان" in df.columns:
        df["Title"] = df["العنوان"].astype(str)  # "العنوان" is the Arabic title column
    elif "Title" not in df.columns:
        df["Title"] = df.iloc[:, 0].astype(str)  # fall back to the first column
    return df
books_df = load_data(BOOKS_FILE)
theses_df = load_data(THESES_FILE)
API_TOKEN = os.environ.get("HF_TOKEN")
API_URL = "https://api-inference.huggingface.co/models/aelsaeed/all-MiniLM-L6-v2-api"
HEADERS = {"Authorization": f"Bearer {API_TOKEN}"}
def get_embedding(text):
    """Return the embedding vector for a single text from the HF Inference API."""
    response = requests.post(API_URL, headers=HEADERS, json={"inputs": [text]})
    return np.array(response.json()[0])
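# Optional batching sketch (an assumption, not part of the original app): it
# presumes the inference endpoint accepts a list of inputs and returns one
# vector per input, which the single-item call above suggests. The hypothetical
# helper get_embeddings_batch is not called anywhere below; it would reduce the
# per-title HTTP round-trips when the embedding cache is first built.
def get_embeddings_batch(texts, batch_size=32):
    vectors = []
    for i in range(0, len(texts), batch_size):
        batch = list(texts[i:i + batch_size])
        response = requests.post(API_URL, headers=HEADERS, json={"inputs": batch})
        response.raise_for_status()
        vectors.extend(response.json())
    return np.array(vectors)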
def search(query, category, mode):
    if not query.strip():
        return "⚠️ اكتب كلمة أو جملة للبحث"  # "Type a word or sentence to search"
    df = books_df if category == "Books" else theses_df
    if mode == "نصي":  # text (substring) search
        results = df[df["Title"].str.contains(query, case=False, na=False)]
    else:  # semantic search
        emb_cache_file = f"{category}_embeddings.pkl"
        if os.path.exists(emb_cache_file):
            with open(emb_cache_file, "rb") as f:
                embeddings = pickle.load(f)
        else:
            embeddings = np.array([get_embedding(t) for t in df["Title"].tolist()])
            with open(emb_cache_file, "wb") as f:
                pickle.dump(embeddings, f)
        query_emb = get_embedding(query)
        # cosine similarity between the query and every title embedding
        scores = np.dot(embeddings, query_emb) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_emb))
        idx = np.argsort(-scores)  # indices sorted by descending similarity
        results = df.iloc[idx]
    if results.empty:
        return "❌ لم يتم العثور على نتائج"  # "No results found"
    html = "<table border=1 style='border-collapse:collapse;width:100%;'>"
    html += "<tr>" + "".join([f"<th>{col}</th>" for col in results.columns]) + "</tr>"
    for _, row in results.iterrows():
        html += "<tr>" + "".join([f"<td>{val}</td>" for val in row.values]) + "</tr>"
    html += "</table>"
    return html
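# Note: the semantic branch above returns every row sorted by similarity, so the
# HTML table can get large. A possible tweak (an assumption, not in the original)
# is to keep only the best matches, e.g.:
#     results = df.iloc[idx[:20]]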
iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(label="اكتب كلمة البحث"),  # "Enter the search term"
        gr.Dropdown(["Books", "Theses"], label="الفئة"),  # "Category"
        gr.Radio(["نصي", "دلالي"], label="نوع البحث")  # "Search type": text / semantic
    ],
    outputs="html",
    title="البحث في المكتبة الرقمية"  # "Digital library search"
)
iface.launch()
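# Quick sanity check without the UI (assumption: comment out iface.launch()
# above first, since it blocks). search() returns either a warning string or an
# HTML table, e.g.:
#     print(search("تاريخ", "Books", "نصي"))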