bstraehle committed on
Commit bdff22f · verified · 1 Parent(s): d919ce7

Upload utils.py

Files changed (1)
  1. utils/utils.py +111 -0
utils/utils.py CHANGED
@@ -0,0 +1,111 @@
+ import os
+ import pandas as pd
+ from docx import Document
+ from pptx import Presentation
+ from datasets import load_dataset
+
+ # Dataset configuration
+
+ SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME")
+
+ DATASET_TYPE_GAIA = "gaia"
+ DATASET_TYPE_HLE = "hle"
+
+ DATASET_FILE_PATH_GAIA = "files/gaia_validation.jsonl"
+ DATASET_FILE_PATH_HLE = "files/hle_validation.jsonl"
+
+ # Dataset processing
+
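+ # Read the local GAIA or HLE validation JSONL, keep rows of the requested level,
+ # and return [question, final answer, file name] triples.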
+ def get_dataset_from_file(dataset_type, level):
+     file_path = ""
+
+     if dataset_type == DATASET_TYPE_GAIA:
+         file_path = DATASET_FILE_PATH_GAIA
+     elif dataset_type == DATASET_TYPE_HLE:
+         file_path = DATASET_FILE_PATH_HLE
+
+     df = pd.read_json(file_path, lines=True)
+
+     df = df[df["Level"] == level]
+
+     result = []
+
+     for _, row in df.iterrows():
+         result.append([row["Question"], row["Final answer"], row["file_name"]])
+
+     return result
+
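+ # Load the "{SPACE_AUTHOR_NAME}/validation" dataset from the Hub, restrict it to the
+ # levels of the requested dataset type (GAIA: 1-3, HLE: 0), and return the same triples.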
+ def get_dataset(dataset_type, level):
+     dataset_repo = f"{SPACE_AUTHOR_NAME}/validation"
+     dataset = load_dataset(dataset_repo, split="validation")
+
+     df = dataset.to_pandas()
+
+     if dataset_type == DATASET_TYPE_GAIA:
+         df = df[df["Level"].isin([1, 2, 3])]
+     elif dataset_type == DATASET_TYPE_HLE:
+         df = df[df["Level"] == 0]
+
+     df = df[df["Level"] == level]
+
+     result = []
+
+     for _, row in df.iterrows():
+         result.append([row["Question"], row["Final answer"], row["file_name"]])
+
+     return result
+
+ # File processing
+
+ def is_ext(file_path, ext):
+     return os.path.splitext(file_path)[1].lower() == ext.lower()
+
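+ # Read a CSV, Excel, or JSON/JSONL file into a DataFrame and return it serialized
+ # as a JSON string (empty string for unsupported extensions).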
+ def read_file_json(file_path):
+     df = None
+
+     if is_ext(file_path, ".csv"):
+         df = pd.read_csv(file_path)
+     elif is_ext(file_path, ".xls") or is_ext(file_path, ".xlsx"):
+         df = pd.read_excel(file_path)
+     elif is_ext(file_path, ".json") or is_ext(file_path, ".jsonl"):
+         df = pd.read_json(file_path, lines=is_ext(file_path, ".jsonl"))
+
+     return "" if df is None else df.to_json()
+
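+ # Extract text from a .docx in document order: headings are wrapped in ** markers,
+ # body paragraphs are kept as-is, and table rows are flattened to " | "-separated lines.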
+ def read_docx_text(file_path):
+     doc = Document(file_path)
+
+     text = []
+
+     for block in doc.element.body:
+         if block.tag.endswith("p"):
+             for paragraph in doc.paragraphs:
+                 if paragraph._element == block:
+                     if paragraph.style.name.startswith("Heading"):
+                         text.append("\n**" + paragraph.text + "**\n")
+                     elif paragraph.text:
+                         text.append(paragraph.text)
+         elif block.tag.endswith("tbl"):
+             for table in doc.tables:
+                 if table._element == block:
+                     for row in table.rows:
+                         row_text = []
+                         for cell in row.cells:
+                             row_text.append(cell.text.strip())
+                         text.append(" | ".join(row_text))
+
+     return "\n".join(text)
+
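+ # Extract text from a .pptx: shapes within a slide are joined with newlines,
+ # and slides are separated by blank lines.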
+ def read_pptx_text(file_path):
+     prs = Presentation(file_path)
+
+     text = []
+
+     for slide in prs.slides:
+         slide_text = []
+         for shape in slide.shapes:
+             if hasattr(shape, "text"):
+                 slide_text.append(shape.text)
+         text.append("\n".join(slide_text))
+
+     return "\n\n".join(text)
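A minimal usage sketch of these helpers, assuming SPACE_AUTHOR_NAME is set in the environment, the validation JSONL files exist at the configured paths, and the module is imported from the Space's repo root; the level value and the "files/example.csv" path below are illustrative placeholders rather than files referenced by this commit.

from utils.utils import DATASET_TYPE_GAIA, get_dataset, get_dataset_from_file, read_file_json

# Level-1 GAIA rows pulled from the Hub-hosted validation split.
rows = get_dataset(DATASET_TYPE_GAIA, 1)

# The same rows read from the local JSONL file instead of the Hub.
rows_local = get_dataset_from_file(DATASET_TYPE_GAIA, 1)

for question, final_answer, file_name in rows[:3]:
    print(question, final_answer, file_name)

# A tabular attachment serialized to a JSON string ("files/example.csv" is a placeholder).
print(read_file_json("files/example.csv"))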