import gradio as gr
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import torch
import spaces
import os
from huggingface_hub import HfApi
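# If this script is deployed as a Hugging Face Space, the imports above imply a
# requirements.txt roughly along the lines of: gradio, datasets, sentence-transformers,
# scikit-learn, matplotlib, torch, spaces, huggingface_hub, pandas, numpy
# (these are the usual PyPI package names; pin versions as needed).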
# Load the Hugging Face token from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

hf_api = HfApi(
    token=HF_TOKEN,  # The token is not persisted on the machine.
)

# Check for GPU support and configure the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
# Define evaluation metrics
def mean_reciprocal_rank(relevance_labels, scores):
    sorted_indices = np.argsort(scores)[::-1]
    for rank, idx in enumerate(sorted_indices, start=1):
        if relevance_labels[idx] == 1:
            return 1 / rank
    return 0

def mean_average_precision(relevance_labels, scores):
    # Guard against queries with no relevant document, where average precision is undefined
    if sum(relevance_labels) == 0:
        return 0.0
    return average_precision_score(relevance_labels, scores)

def ndcg_at_k(relevance_labels, scores, k=10):
    sorted_indices = np.argsort(scores)[::-1]
    relevance_sorted = np.take(relevance_labels, sorted_indices[:k])
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance_sorted))
    idcg = sum(1 / np.log2(rank + 2) for rank in range(min(k, sum(relevance_labels))))
    return dcg / idcg if idcg > 0 else 0
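# Quick sanity check for the metrics above (a minimal illustration, not used by the app):
# with relevance_labels = [0, 1, 0] and scores = [0.1, 0.9, 0.2] the only relevant
# document is ranked first, so all three functions return 1.0; with
# scores = [0.9, 0.1, 0.2] it drops to rank 3, giving MRR = 1/3, MAP = 1/3 and
# nDCG@10 = 1 / log2(4) = 0.5.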
# Load datasets
datasets = {
    "Relevance_Labels_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-query-candidate", token=HF_TOKEN)["train"].select(range(300)),
    "Positive_Negatives_Dataset": load_dataset("Omartificial-Intelligence-Space/re-ar-test-triplet", token=HF_TOKEN)["train"].select(range(300))
}
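# The evaluation loop below assumes two schemas: the relevance-labels dataset exposes
# "query", "candidate_document" and "relevance_label" columns (multiple candidates per
# query), while the triplet dataset exposes "query", "positive" and "negative1".."negative4".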
@spaces.GPU  # Requests GPU access when running on a ZeroGPU Space; a no-op on other hardware
def evaluate_model_with_insights(model_name):
    model = CrossEncoder(model_name, device=device)
    results = []
    sample_outputs = []

    for dataset_name, dataset in datasets.items():
        all_mrr, all_map, all_ndcg = [], [], []
        dataset_samples = []
        if 'candidate_document' in dataset.column_names:
            grouped_data = dataset.to_pandas().groupby("query")
            for query, group in grouped_data:
                # Skip invalid queries
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                # Keep documents and labels aligned while dropping empty candidates
                candidates = [
                    (doc, label)
                    for doc, label in zip(group['candidate_document'], group['relevance_label'])
                    if isinstance(doc, str) and doc.strip() != ""
                ]
                if not candidates:
                    continue
                candidate_texts = [doc for doc, _ in candidates]
                relevance_labels = [label for _, label in candidates]
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collect top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        else:
            for entry in dataset:
                query = entry['query']
                # Validate the query
                if query is None or not isinstance(query, str) or query.strip() == "":
                    continue
                # Pair each document with its relevance label, dropping missing or empty
                # entries so labels stay aligned with the candidates that are actually scored
                labeled_docs = [
                    (doc, label)
                    for doc, label in [
                        (entry.get('positive'), 1),
                        (entry.get('negative1'), 0),
                        (entry.get('negative2'), 0),
                        (entry.get('negative3'), 0),
                        (entry.get('negative4'), 0),
                    ]
                    if isinstance(doc, str) and doc.strip() != ""
                ]
                if not labeled_docs:
                    continue
                candidate_texts = [doc for doc, _ in labeled_docs]
                relevance_labels = [label for _, label in labeled_docs]
                pairs = [(query, doc) for doc in candidate_texts]
                scores = model.predict(pairs)
                # Collect top-5 results for display
                sorted_indices = np.argsort(scores)[::-1]
                top_docs = [(candidate_texts[i], scores[i], relevance_labels[i]) for i in sorted_indices[:5]]
                dataset_samples.append({
                    "Query": query,
                    "Top 5 Candidates": top_docs
                })
                # Metrics
                all_mrr.append(mean_reciprocal_rank(relevance_labels, scores))
                all_map.append(mean_average_precision(relevance_labels, scores))
                all_ndcg.append(ndcg_at_k(relevance_labels, scores, k=10))
        # Metrics for this dataset
        results.append({
            "Dataset": dataset_name,
            "MRR": np.mean(all_mrr),
            "MAP": np.mean(all_map),
            "nDCG@10": np.mean(all_ndcg)
        })
        # Collect sample outputs for inspection
        sample_outputs.extend(dataset_samples)

    results_df = pd.DataFrame(results)

    # Plot results as a bar chart
    fig, ax = plt.subplots(figsize=(8, 6))
    results_df.plot(kind='bar', x='Dataset', y=['MRR', 'MAP', 'nDCG@10'], ax=ax)
    ax.set_title(f"Evaluation Results for {model_name}")
    ax.set_ylabel("Score")
    plt.xticks(rotation=0)

    return results_df, fig, sample_outputs
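# Example of calling the evaluation directly, outside the Gradio UI (the model name is
# just the placeholder suggested in the interface below):
#   results_df, fig, samples = evaluate_model_with_insights("NAMAA-Space/GATE-Reranker-V1")
#   print(results_df)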
# Gradio app interface
def gradio_app_with_insights(model_name):
    results_df, chart, samples = evaluate_model_with_insights(model_name)
    sample_display = []
    for sample in samples:
        sample_display.append(f"Query: {sample['Query']}")
        for doc, score, label in sample["Top 5 Candidates"]:
            sample_display.append(f"  Doc: {doc[:50]}... | Score: {score:.2f} | Relevance: {label}")
        sample_display.append("\n")
    return results_df, chart, "\n".join(sample_display)
interface = gr.Interface(
    fn=gradio_app_with_insights,
    inputs=gr.Textbox(label="Enter Model Name", placeholder="e.g., NAMAA-Space/GATE-Reranker-V1"),
    outputs=[
        gr.Dataframe(label="Evaluation Results"),
        gr.Plot(label="Evaluation Metrics Chart"),
        gr.Textbox(label="Sample Reranking Insights", lines=15)
    ],
    title="Arabic Reranking Model Evaluation and Insights",
    description=(
        "This app evaluates Arabic reranking models on two datasets:\n"
        "1. **Relevance Labels Dataset**\n"
        "2. **Positive-Negatives Dataset**\n\n"
        "### Metrics Used:\n"
        "- **MRR (Mean Reciprocal Rank)**: Measures how quickly the first relevant document appears.\n"
        "- **MAP (Mean Average Precision)**: Reflects ranking quality across all relevant documents.\n"
        "- **nDCG@10 (Normalized Discounted Cumulative Gain)**: Focuses on the ranking of relevant documents in the top 10.\n\n"
        "Input a model name to evaluate its performance, view metrics, and examine sample reranking results."
    )
)

interface.launch(debug=True)
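# Note: debug=True keeps the main thread blocked and, in notebook environments such as
# Colab, prints errors to the cell output; in a deployed Space, launch() blocks either way.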