aelsaeed's picture
Upload app.py
f0d9fe9 verified
raw
history blame
3.16 kB
import os
import pandas as pd
import numpy as np
import requests
import gradio as gr
import gdown
import pickle
# Local filenames the downloaded spreadsheets are cached under.
BOOKS_FILE = "book.xlsx"
THESES_FILE = "theses.xlsx"
# Google Drive file IDs of the data files (used by download_from_drive).
DRIVE_LINKS = {
"books": "1FElHiASfiVLeuHWYaqd2Q5foxWRlJT-O",
"theses": "1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv"
}
def download_from_drive(file_id, output):
    """Download a publicly shared Google Drive file to *output* using gdown.

    `file_id` is the Drive file ID (not a full URL); download is silent.
    """
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    gdown.download(download_url, output, quiet=True)
# Fetch the spreadsheets from Google Drive on first run (skip if already present).
for _name, _path in (("books", BOOKS_FILE), ("theses", THESES_FILE)):
    if not os.path.exists(_path):
        download_from_drive(DRIVE_LINKS[_name], _path)
# Data loading.
def load_data(file):
    """Read an Excel sheet into a DataFrame, guaranteeing a "Title" column.

    Missing cells are filled with the Arabic placeholder "غير متوافر".
    If no "Title" column exists, it is derived from the Arabic "العنوان"
    column when present, otherwise from the first column.
    """
    frame = pd.read_excel(file).fillna("غير متوافر")
    if "Title" in frame.columns:
        return frame
    # Fall back to the Arabic title column, then to the first column.
    source = frame["العنوان"] if "العنوان" in frame.columns else frame.iloc[:, 0]
    frame["Title"] = source.astype(str)
    return frame
# Load both datasets once at startup.
books_df, theses_df = (load_data(path) for path in (BOOKS_FILE, THESES_FILE))
# Hugging Face Inference API configuration.
API_TOKEN = os.environ.get("HF_TOKEN")  # None if the HF_TOKEN secret is unset
API_URL = "https://api-inference.huggingface.co/models/aelsaeed/all-MiniLM-L6-v2-api"
# BUG FIX: the original read the undefined name HF_TOKEN here (the variable
# defined above is API_TOKEN), which raised NameError at import time.
HEADERS = {"Authorization": f"Bearer {API_TOKEN}"}
def get_embedding(text):
    """Embed *text* via the HF Inference API and return it as a 1-D numpy array.

    Improvements over the original: a request timeout (the call could hang
    indefinitely) and an explicit HTTP status check (a non-200 response used
    to surface only as a confusing JSON/indexing error).
    """
    response = requests.post(
        API_URL, headers=HEADERS, json={"inputs": [text]}, timeout=30
    )
    response.raise_for_status()  # fail loudly on API errors (e.g. 401/503)
    return np.array(response.json()[0])
def _render_table(results):
    """Render a DataFrame as an HTML table, escaping every header and cell.

    SECURITY FIX: the original interpolated spreadsheet values into HTML
    unescaped, allowing markup/script injection from the data files.
    """
    from html import escape

    header_cells = "".join(f"<th>{escape(str(col))}</th>" for col in results.columns)
    body_rows = []
    for _, row in results.iterrows():
        cells = "".join(f"<td>{escape(str(val))}</td>" for val in row.values)
        body_rows.append(f"<tr>{cells}</tr>")
    return (
        "<table border=1 style='border-collapse:collapse;width:100%;'>"
        f"<tr>{header_cells}</tr>" + "".join(body_rows) + "</table>"
    )

def search(query, category, mode):
    """Search the selected dataset and return the matches as an HTML table.

    query    -- the user's search text; blank input returns a warning message.
    category -- "Books" selects books_df, anything else selects theses_df.
    mode     -- "نصي" (textual) does a case-insensitive substring match on
                Title; otherwise a semantic search ranks all rows by cosine
                similarity between API embeddings.
    """
    if not query.strip():
        return "⚠️ اكتب كلمة أو جملة للبحث"
    df = books_df if category == "Books" else theses_df
    if mode == "نصي":
        results = df[df["Title"].str.contains(query, case=False, na=False)]
    else:
        emb_cache_file = f"{category}_embeddings.pkl"
        if os.path.exists(emb_cache_file):
            # NOTE(review): pickle.load is only safe because this cache file is
            # written by this app itself; never point it at untrusted input.
            with open(emb_cache_file, "rb") as f:
                embeddings = pickle.load(f)
        else:
            # One API call per title; cache the result so this happens once.
            embeddings = np.array([get_embedding(t) for t in df["Title"].tolist()])
            with open(emb_cache_file, "wb") as f:
                pickle.dump(embeddings, f)
        query_emb = get_embedding(query)
        # Cosine similarity of every row embedding against the query embedding.
        scores = np.dot(embeddings, query_emb) / (
            np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_emb)
        )
        results = df.iloc[np.argsort(-scores)]  # best match first
    if results.empty:
        return "❌ لم يتم العثور على نتائج"
    return _render_table(results)
# Assemble the Gradio UI and start serving.
query_box = gr.Textbox(label="اكتب كلمة البحث")
category_choice = gr.Dropdown(["Books", "Theses"], label="الفئة")
mode_choice = gr.Radio(["نصي", "دلالي"], label="نوع البحث")

iface = gr.Interface(
    fn=search,
    inputs=[query_box, category_choice, mode_choice],
    outputs="html",
    title="البحث في المكتبة الرقمية",
)
iface.launch()