# =============================================================================
# Marimo Notebook Template: Lazily Load & Interactively View a Hugging Face Parquet Dataset
# =============================================================================
# This template demonstrates how to:
#  • Lazily scan a Hugging Face dataset across all of its subdirectories using a
#    recursive globbing pattern for Parquet files.
#  • Preview the data along with basic metadata using a small helper function.
#  • Provide an interactive button that expands the view to the full dataset.
#  • (Optionally) Read local JSONL files (commented out).
#
# Note: Per the Polars documentation, Hugging Face datasets can be read with
#       pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}"),
#       or scanned lazily with pl.scan_parquet, and globbing patterns such as
#       "**/*.parquet" match all Parquet files recursively.
#
# Each numbered block below is meant to live in its own marimo notebook cell.
#
# Install dependencies with:
#   pip install polars marimo
# =============================================================================

# ------------------------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------------------------
import polars as pl
import marimo as mo  # marimo provides the notebook runtime and UI elements (mo.ui, mo.md, mo.lazy, ...)

# ------------------------------------------------------------------------------
# 2. Lazily Load the Dataset
#
# Use the recursive globbing pattern "**/*.parquet" to scan all Parquet files in
# all subdirectories of the dataset on Hugging Face. pl.scan_parquet builds a
# LazyFrame, so nothing is downloaded until .collect() is called.
# ------------------------------------------------------------------------------
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

def load_dataset() -> pl.LazyFrame:
    # Scan (rather than read) all Parquet files matching the recursive pattern;
    # execution is deferred until the query is collected.
    lf = pl.scan_parquet(dataset_url)
    # Uncomment the next line to scan local JSONL files instead:
    # lf = pl.scan_ndjson("/local/path/to/*.jsonl")
    return lf

# load_dataset() returns a LazyFrame; downstream cells collect only what they need.
lf = load_dataset()
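
# ------------------------------------------------------------------------------
# Optional: inspect the LazyFrame before materializing any rows. A minimal
# sketch assuming the dataset above is reachable; the schema is resolved from
# the scan, and the row count is computed by collecting a single aggregate.
# ------------------------------------------------------------------------------
column_names = lf.collect_schema().names()     # column names, resolved lazily
n_rows = lf.select(pl.len()).collect().item()  # total row count across all files
mo.md(f"**Columns:** {', '.join(column_names)}  \n**Rows:** {n_rows:,}")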

# ------------------------------------------------------------------------------
# 3. Preview the DataFrame
#
# Define a small helper that materializes only the first rows and renders them
# with marimo's interactive table, with the column names shown above it.
# ------------------------------------------------------------------------------
def preview_dataframe(lf: pl.LazyFrame, n_rows: int = 10):
    # Collect just the first n_rows for display.
    head = lf.head(n_rows).collect()
    info = mo.md(f"**Preview of the first {n_rows} rows.** Columns: {', '.join(head.columns)}")
    return mo.vstack([info, mo.ui.table(head)])

# Build and render the preview (the last expression in a cell is what marimo displays).
preview = preview_dataframe(lf)
preview
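
# ------------------------------------------------------------------------------
# Optional: a larger, paginated preview. A minimal sketch; page_size controls
# the rows per page in mo.ui.table, and the 500-row cutoff is an arbitrary
# illustrative limit rather than anything required by the dataset.
# ------------------------------------------------------------------------------
paginated_preview = mo.ui.table(lf.head(500).collect(), page_size=25)
paginated_preview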

# ------------------------------------------------------------------------------
# 4. Expand the DataFrame for Better Visualization
#
# Create an interactive button whose value flips to True once clicked. A later
# cell reacts to that value and renders the full dataset instead of the preview.
# ------------------------------------------------------------------------------
expand_option = mo.ui.button(
    label="Expand DataFrame",
    value=False,
    on_click=lambda _: True,  # clicking sets the button's value to True
)

# Render the expand button.
expand_option

# In a separate cell: show the full table once the button has been clicked,
# otherwise keep the compact preview. Collecting the full LazyFrame downloads
# the entire dataset, so it only happens after the user asks for it.
mo.ui.table(lf.collect()) if expand_option.value else preview
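
# ------------------------------------------------------------------------------
# Optional alternative: defer the expensive full-table render with mo.lazy.
# A minimal sketch; mo.lazy accepts a callable and only invokes it when the
# element actually needs to be shown (here, when the accordion is opened), so
# the full .collect() is postponed until then.
# ------------------------------------------------------------------------------
mo.accordion(
    {"Full dataset (loads on first open)": mo.lazy(lambda: mo.ui.table(lf.collect()))}
)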

# ------------------------------------------------------------------------------
# 5. Commented-Out Examples of Column Selection
#
# The following examples (commented out) demonstrate different column selection
# techniques on the LazyFrame (append .collect() to materialize the result);
# a runnable, self-contained version follows after this block.
#
# Example 1: Select specific columns by name:
# selected_columns = lf.select(["column1", "column2"])
#
# Example 2: Select all columns except column "a":
# all_except_a = lf.select(pl.exclude("a"))
#
# Example 3: Select a range of columns (e.g., the second through the fourth):
# names = lf.collect_schema().names()
# range_columns = lf.select(pl.col(names[1:4]))
# ------------------------------------------------------------------------------
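
# ------------------------------------------------------------------------------
# A self-contained, runnable sketch of the selection patterns above, using a
# small throwaway DataFrame so it works without any network access. The column
# names ("a", "b", "c") and values are purely illustrative.
# ------------------------------------------------------------------------------
demo = pl.DataFrame({"a": [1, 2], "b": ["x", "y"], "c": [0.1, 0.2]})
demo.select(["a", "b"])                 # specific columns by name
demo.select(pl.exclude("a"))            # every column except "a"
demo.select(pl.col(demo.columns[1:3]))  # a positional range of columns ("b" and "c")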