Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Claude

Sleeping

App Files Files Community

awacke1 committed on Jul 19

Commit

55c99d6

verified ·

1 Parent(s): e293bcf

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -30

app.py CHANGED Viewed

@@ -3,48 +3,99 @@ import gradio as gr
 import pandas as pd
 import requests
 import io
-import dask.dataframe as dd
-from datasets import load_dataset, Image
-from mlcroissant import Dataset as CroissantDataset
-from huggingface_hub import get_token
-import polars as pl
 import warnings
 import traceback
 import json
-import tempfile # Added for creating temporary files
-# 🤫 Let's ignore those pesky warnings, shall we?
 warnings.filterwarnings("ignore")
 # --- ⚙️ Configuration & Constants ---
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
-        "methods": ["💨 API (requests)", "🧊 Dask", "🥐 Croissant"], "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
-        "methods": ["🐼 Pandas", "💨 API (requests)", "🥐 Croissant"], "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
-        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
-        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
-        "methods": ["🤗 Datasets", "🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
 }
 # --- 🔧 Helpers & Utility Functions ---
 def get_auth_headers():
-    token = get_token()
-    return {"Authorization": f"Bearer {token}"} if token else {}
 # --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
 def dataframe_to_outputs(df: pd.DataFrame):
@@ -261,8 +312,24 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
                     outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
                     if dataset_key == 'inscene':
-                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
-                        outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
                     yield tuple(outputs)
             outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
@@ -289,10 +356,14 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
                          df = pd.read_json(f"{file_path}medical_o1_sft.json")
         elif "Datasets" in access_method:
             ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
             df = pd.DataFrame(ds)
         elif "Polars" in access_method:
             outputs[2] = "⏳ Loading with Polars..."
             yield tuple(outputs)
             if repo_id == "fka/awesome-chatgpt-prompts":
@@ -302,22 +373,50 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
             df = pl_df.to_pandas()
         elif "Dask" in access_method:
             outputs[2] = "⏳ Loading with Dask..."
             yield tuple(outputs)
             dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
             df = dask_df.head(1000)  # Convert to pandas for processing
         elif "Croissant" in access_method:
             outputs[2] = "⏳ Loading with Croissant..."
             yield tuple(outputs)
-            headers = get_auth_headers() if not config["is_public"] else {}
-            croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
-            response = requests.get(croissant_url, headers=headers)
-            response.raise_for_status()
-            jsonld = response.json()
-            ds = CroissantDataset(jsonld=jsonld)
-            records = list(ds.records("default"))[:1000]  # Take first 1000
-            df = pd.DataFrame(records)
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
@@ -329,8 +428,24 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
         outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
         if dataset_key == 'inscene' and not final_df.empty:
-            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
-            outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
         yield tuple(outputs)
@@ -347,9 +462,21 @@ def create_dataset_tab(dataset_key: str):
         if not config['is_public']:
             gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
         with gr.Row():
-            access_method = gr.Radio(config['methods'], label="🔑 Access Method", value=config['methods'][0])
-            query = gr.Textbox(label="🔍 Search Query", placeholder="Enter any text to search, or leave blank for samples...")
         fetch_button = gr.Button("🚀 Go Fetch!")
         status_output = gr.Markdown("🏁 Ready to search.")
@@ -385,7 +512,20 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
         "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
     )
-    with gr.Accordion("🔧 Quick Start Guide", open=False):
         gr.Markdown("""
         ### 🚀 Quick Start:
         1. **🤖 Prompts Tab**: Try API method, search for "translator" or "linux"
@@ -402,7 +542,13 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
         - **🐼 Pandas**: Full dataset access, requires login for gated datasets
         - **🤗 Datasets**: Good for streaming large datasets
         - **🧊 Polars/Dask**: Alternative fast data processing
-        - **🥐 Croissant**: Metadata-aware loading
         """)
     with gr.Tabs():

 import pandas as pd
 import requests
 import io
 import warnings
 import traceback
 import json
+import tempfile
+import os
+import logging
+# 🤫 Suppress warnings and set logging levels
 warnings.filterwarnings("ignore")
+logging.getLogger("absl").setLevel(logging.ERROR)  # Suppress MLCroissant warnings
+os.environ["ABSL_LOG_LEVEL"] = "2"  # Only show errors
+# Import optional dependencies with fallbacks
+try:
+    import dask.dataframe as dd
+    DASK_AVAILABLE = True
+except ImportError:
+    DASK_AVAILABLE = False
+try:
+    from datasets import load_dataset, Image
+    DATASETS_AVAILABLE = True
+except ImportError:
+    DATASETS_AVAILABLE = False
+try:
+    from mlcroissant import Dataset as CroissantDataset
+    CROISSANT_AVAILABLE = True
+except ImportError:
+    CROISSANT_AVAILABLE = False
+try:
+    from huggingface_hub import get_token
+    HF_HUB_AVAILABLE = True
+except ImportError:
+    HF_HUB_AVAILABLE = False
+try:
+    import polars as pl
+    POLARS_AVAILABLE = True
+except ImportError:
+    POLARS_AVAILABLE = False
 # --- ⚙️ Configuration & Constants ---
+def get_available_methods():
+    """🔧 List the access-method labels usable with the installed libraries.
+
+    The API and Pandas labels are always included. Each optional label is
+    added only when its import flag is true, always in the same order.
+    A new list is returned on every call.
+    """
+    optional_methods = (
+        (DATASETS_AVAILABLE, "🤗 Datasets"),
+        (POLARS_AVAILABLE, "🧊 Polars"),
+        (DASK_AVAILABLE, "🧊 Dask"),
+        (CROISSANT_AVAILABLE, "🥐 Croissant"),
+    )
+    methods = ["💨 API (requests)", "🐼 Pandas"]
+    methods.extend(label for enabled, label in optional_methods if enabled)
+    return methods
+# Registry of the datasets shown in the UI, keyed by a short dataset key.
+#   "name"      - dataset identifier (looks like a Hugging Face repo id; TODO confirm
+#                 it is what the loaders use as `repo_id`)
+#   "emoji"     - icon associated with the dataset
+#   "methods"   - access-method labels offered for the dataset; every entry calls
+#                 get_available_methods(), so all datasets currently offer the same
+#                 labels, each in its own list object
+#   "is_public" - False marks a gated dataset: the UI shows a login note and the
+#                 Croissant loader sends auth headers for it
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
+        "methods": get_available_methods(), "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
+        "methods": get_available_methods(), "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
+        "methods": get_available_methods(), "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
+        "methods": get_available_methods(), "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
+        "methods": get_available_methods(), "is_public": False,
     },
 }
 # --- 🔧 Helpers & Utility Functions ---
 def get_auth_headers():
+    """🔑 Build the HTTP Authorization header from the locally stored token.
+
+    Returns an empty dict when huggingface_hub could not be imported, when
+    get_token() yields nothing, or when looking up the token raises.
+    """
+    headers = {}
+    if HF_HUB_AVAILABLE:
+        try:
+            token = get_token()
+            if token:
+                headers["Authorization"] = f"Bearer {token}"
+        except Exception:
+            # Best effort: any failure falls back to sending no auth header.
+            headers = {}
+    return headers
 # --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
 def dataframe_to_outputs(df: pd.DataFrame):
                     outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
                     if dataset_key == 'inscene':
+                        # Build the gallery from the rows found so far. A failure here
+                        # must not abort the search, so it is reported in the status
+                        # text instead of being raised.
+                        try:
+                            gallery_data = []
+                            for _, row in all_results_df.iterrows():
+                                if 'image' in row:
+                                    image_data = row.get('image')
+                                    text_data = row.get('text', '')
+                                    # Accept PIL-style images (anything with .save) and
+                                    # strings (image path or URL); skip everything else.
+                                    if hasattr(image_data, 'save') or isinstance(image_data, str):
+                                        gallery_data.append((image_data, text_data))
+                            if gallery_data:
+                                outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+                        except Exception as img_error:
+                            # Don't break the flow for image errors, but surface them in
+                            # the status line rather than swallowing them silently.
+                            outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"
                     yield tuple(outputs)
             outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
                          df = pd.read_json(f"{file_path}medical_o1_sft.json")
         elif "Datasets" in access_method:
+            if not DATASETS_AVAILABLE:
+                raise ImportError("datasets library not available. Install with: pip install datasets")
             ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
             df = pd.DataFrame(ds)
         elif "Polars" in access_method:
+            if not POLARS_AVAILABLE:
+                raise ImportError("polars library not available. Install with: pip install polars")
             outputs[2] = "⏳ Loading with Polars..."
             yield tuple(outputs)
             if repo_id == "fka/awesome-chatgpt-prompts":
             df = pl_df.to_pandas()
         elif "Dask" in access_method:
+            if not DASK_AVAILABLE:
+                raise ImportError("dask library not available. Install with: pip install dask")
             outputs[2] = "⏳ Loading with Dask..."
             yield tuple(outputs)
             dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
             df = dask_df.head(1000)  # Convert to pandas for processing
         elif "Croissant" in access_method:
+            if not CROISSANT_AVAILABLE:
+                raise ImportError("mlcroissant library not available. Install with: pip install mlcroissant")
             outputs[2] = "⏳ Loading with Croissant..."
             yield tuple(outputs)
+            try:
+                headers = get_auth_headers() if not config["is_public"] else {}
+                croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
+                response = requests.get(croissant_url, headers=headers)
+                response.raise_for_status()
+                jsonld = response.json()
+                # Suppress MLCroissant warnings during dataset creation
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")
+                    ds = CroissantDataset(jsonld=jsonld)
+                    records = list(ds.records("default"))[:1000]  # Take first 1000
+                    df = pd.DataFrame(records)
+            except Exception as croissant_error:
+                # If Croissant fails, fall back to API method
+                outputs[2] = f"⚠️ Croissant method failed, falling back to API method..."
+                yield tuple(outputs)
+                # Retry with API method
+                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
+                headers = get_auth_headers() if not config["is_public"] else {}
+                response = requests.get(url, headers=headers)
+                response.raise_for_status()
+                data = response.json()
+                if data.get('rows'):
+                    rows_data = [item['row'] for item in data['rows']]
+                    df = pd.json_normalize(rows_data)
+                else:
+                    raise Exception("No data available from fallback API method")
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
         outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
         if dataset_key == 'inscene' and not final_df.empty:
+            # Handle image data more safely
+            try:
+                gallery_data = []
+                for _, row in final_df.iterrows():
+                    if 'image' in row:
+                        image_data = row.get('image')
+                        text_data = row.get('text', '')
+                        # Handle different image formats
+                        if hasattr(image_data, 'save'):  # PIL Image
+                            gallery_data.append((image_data, text_data))
+                        elif isinstance(image_data, str):  # Image path or URL
+                            gallery_data.append((image_data, text_data))
+                if gallery_data:
+                    outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+            except Exception as img_error:
+                outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"
         yield tuple(outputs)
         if not config['is_public']:
             gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
+        # Show available methods for this dataset
+        available_methods = config['methods']
+        # Six method labels exist in total: API, Pandas, Datasets, Polars, Dask
+        # and Croissant. Keep the threshold and the message tied to one value.
+        total_methods = 6
+        # Show the note whenever at least one method is missing. The former
+        # `< 5` check stayed silent when exactly one optional library was absent.
+        if len(available_methods) < total_methods:
+            gr.Markdown(f"**Available methods:** {len(available_methods)} of {total_methods} possible methods")
         with gr.Row():
+            access_method = gr.Radio(
+                available_methods,
+                label="🔑 Access Method",
+                value=available_methods[0] if available_methods else "💨 API (requests)"
+            )
+            query = gr.Textbox(
+                label="🔍 Search Query",
+                placeholder="Enter any text to search, or leave blank for samples..."
+            )
         fetch_button = gr.Button("🚀 Go Fetch!")
         status_output = gr.Markdown("🏁 Ready to search.")
         "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
     )
+    # Show dependency status
+    def get_dependency_status():
+        """Return a Markdown checklist showing which optional libraries were imported."""
+        def mark(installed):
+            # Same wording for every optional dependency.
+            return '✅ Available' if installed else '❌ Not installed'
+
+        parts = [
+            "### 🔧 Available Libraries:\n",
+            "- **💨 API**: ✅ Always available\n",
+            "- **🐼 Pandas**: ✅ Available\n",
+            f"- **🤗 Datasets**: {mark(DATASETS_AVAILABLE)}\n",
+            f"- **🧊 Polars**: {mark(POLARS_AVAILABLE)}\n",
+            f"- **🧊 Dask**: {mark(DASK_AVAILABLE)}\n",
+            f"- **🥐 Croissant**: {mark(CROISSANT_AVAILABLE)}\n",
+            f"- **🔑 HF Authentication**: {mark(HF_HUB_AVAILABLE)}\n",
+        ]
+        return "".join(parts)
+    with gr.Accordion("🔧 Library Status & Quick Start Guide", open=False):
+        gr.Markdown(get_dependency_status())
         gr.Markdown("""
         ### 🚀 Quick Start:
         1. **🤖 Prompts Tab**: Try API method, search for "translator" or "linux"
         - **🐼 Pandas**: Full dataset access, requires login for gated datasets
         - **🤗 Datasets**: Good for streaming large datasets
         - **🧊 Polars/Dask**: Alternative fast data processing
+        - **🥐 Croissant**: Metadata-aware loading (has fallback to API)
+        ### 📦 Missing Libraries:
+        If methods are missing, install with:
+        ```bash
+        pip install datasets polars dask mlcroissant GitPython
+        ```
         """)
     with gr.Tabs():