Spaces:

MCP-1st-Birthday
/

gaia

Running

App Files Files Community

bstraehle committed 10 days ago

Commit

bdff22f

verified ·

1 Parent(s): d919ce7

Upload utils.py

Browse files

Files changed (1) hide show

utils/utils.py +111 -0

utils/utils.py CHANGED Viewed

	@@ -0,0 +1,111 @@

+import os
+import pandas as pd
+from docx import Document
+from pptx import Presentation
+from datasets import load_dataset
# Dataset configuration

# Namespace used to build the dataset repo id; None when the
# SPACE_AUTHOR_NAME environment variable is not set.
SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME")

# Values accepted for the ``dataset_type`` argument of the dataset loaders.
DATASET_TYPE_GAIA = "gaia"
DATASET_TYPE_HLE = "hle"

# Local JSON Lines files, one per dataset type.
DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
# Dataset processing
def get_dataset_from_file(dataset_type, level):
    """Load the rows of one level from a local JSON Lines dataset file.

    Args:
        dataset_type: ``DATASET_TYPE_GAIA`` or ``DATASET_TYPE_HLE``; selects
            which local file is read.
        level: Value compared for equality against the "Level" column.

    Returns:
        A list of ``[question, final_answer, file_name]`` lists, one per
        matching row, in file order.

    Raises:
        ValueError: If ``dataset_type`` is not one of the known types.
    """
    if dataset_type == DATASET_TYPE_GAIA:
        file_path = DATASET_FILE_PATH_GAIA
    elif dataset_type == DATASET_TYPE_HLE:
        file_path = DATASET_FILE_PATH_HLE
    else:
        # Fail with a clear message instead of handing an empty path to pandas.
        raise ValueError(f"Unknown dataset type: {dataset_type!r}")

    df = pd.read_json(file_path, lines=True)
    df = df[df["Level"] == level]
    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]
def get_dataset(dataset_type, level):
    """Load the rows of one level from the "validation" dataset repo.

    The repo id is ``"<SPACE_AUTHOR_NAME>/validation"`` and its "validation"
    split is loaded through ``load_dataset``.

    Args:
        dataset_type: ``DATASET_TYPE_GAIA`` keeps rows whose "Level" is 1, 2
            or 3; ``DATASET_TYPE_HLE`` keeps rows whose "Level" is 0. Any
            other value applies no type filter.
        level: Value compared for equality against the "Level" column after
            the type filter.

    Returns:
        A list of ``[question, final_answer, file_name]`` lists, one per
        matching row.

    Raises:
        RuntimeError: If ``SPACE_AUTHOR_NAME`` is unset or empty.
    """
    if not SPACE_AUTHOR_NAME:
        # Without this guard the repo id would silently become
        # "None/validation" and fail later with a misleading lookup error.
        raise RuntimeError(
            "SPACE_AUTHOR_NAME environment variable must be set to load the dataset"
        )

    dataset_repo = f"{SPACE_AUTHOR_NAME}/validation"
    dataset = load_dataset(dataset_repo, split="validation")
    df = dataset.to_pandas()

    if dataset_type == DATASET_TYPE_GAIA:
        df = df[df["Level"].isin([1, 2, 3])]
    elif dataset_type == DATASET_TYPE_HLE:
        df = df[df["Level"] == 0]

    df = df[df["Level"] == level]
    return [
        [row["Question"], row["Final answer"], row["file_name"]]
        for _, row in df.iterrows()
    ]
# File processing
def is_ext(file_path, ext):
    """Return True when ``file_path`` ends with extension ``ext``.

    The comparison is case-insensitive and ``ext`` includes the leading dot
    (for example ``".csv"``).
    """
    _, suffix = os.path.splitext(file_path)
    wanted = ext.lower()
    return suffix.lower() == wanted
def read_file_json(file_path):
    """Read a tabular or JSON file and return its contents as a JSON string.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv``, ``.xls``/``.xlsx``, ``.json`` and ``.jsonl``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The loaded DataFrame serialized with ``DataFrame.to_json()``, or an
        empty string when the extension is not one of the supported ones.
    """
    # Resolve the extension once rather than re-splitting the path per check.
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".csv":
        df = pd.read_csv(file_path)
    elif extension in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif extension == ".json":
        df = pd.read_json(file_path)
    elif extension == ".jsonl":
        # JSON Lines holds one record per line, so it must be parsed
        # line by line; parsing it as a single document fails.
        df = pd.read_json(file_path, lines=True)
    else:
        return ""

    return df.to_json()
def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping body order.

    Top-level paragraphs and tables are emitted in the order they appear in
    the document body. Paragraphs whose style name starts with "Heading" are
    wrapped as ``**text**`` with surrounding blank lines, other paragraphs
    are included only when non-empty, and each table row becomes one line
    with stripped cell texts joined by " | ".

    Args:
        file_path: Path of the .docx file.

    Returns:
        The extracted lines joined with newlines.
    """
    doc = Document(file_path)

    # Map each underlying XML element to its wrapper once, so the body walk
    # below is a single pass instead of rescanning every paragraph/table for
    # every body element.
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    text = []
    for block in doc.element.body:
        if block.tag.endswith("p"):
            paragraph = paragraphs_by_element.get(block)
            if paragraph is None:
                continue
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
        elif block.tag.endswith("tbl"):
            table = tables_by_element.get(block)
            if table is None:
                continue
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))
    return "\n".join(text)
def read_pptx_text(file_path):
    """Extract the text of a .pptx file, slide by slide.

    For every slide, the ``text`` of each shape that exposes a ``text``
    attribute is collected and joined with newlines; the per-slide texts are
    then joined with blank lines.

    Args:
        file_path: Path of the .pptx file.

    Returns:
        The combined text of all slides.
    """
    presentation = Presentation(file_path)
    per_slide = [
        "\n".join(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        for slide in presentation.slides
    ]
    return "\n\n".join(per_slide)