Spaces:
Sleeping
Sleeping
| # ============================================================================= | |
| # Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face Parquet Dataset | |
| # ============================================================================= | |
| # This template demonstrates how to: | |
| # • Load a Hugging Face dataset from all directories using a recursive globbing | |
| # pattern for Parquet files (read eagerly with pl.read_parquet). | |
| # • Preview the loaded DataFrame along with metadata using a custom command. | |
| # • Provide an interactive button to expand the DataFrame view. | |
| # • (Optionally) Read local JSONL files (commented out). | |
| # | |
| # Note: According to the Polars documentation, you can read multiple files with: | |
| # pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}") | |
| # and globbing patterns such as "**/*.parquet" work to query all files recursively. | |
| # | |
| # Install dependencies with: | |
| # pip install polars marimo | |
| # ============================================================================= | |
| import polars as pl | |
| import marimo as mo # Marimo provides UI and lazy-loading decorators | |
| # ------------------------------------------------------------------------------ | |
| # 2. Load the Dataset | |
| # | |
| # Use the recursive globbing pattern "**/*.parquet" to read all Parquet files | |
| # from all subdirectories on Hugging Face. | |
| # ------------------------------------------------------------------------------ | |
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"


def load_dataset(source=None):
    """Eagerly read every Parquet file matching ``source`` into one DataFrame.

    Args:
        source: Path, URL, or glob pattern understood by ``pl.read_parquet``.
            When omitted, the module-level ``dataset_url`` is used.

    Returns:
        A fully materialized ``pl.DataFrame``. ``pl.read_parquet`` is not
        lazy: all matching files are read when this function is called.
        If deferred execution is wanted, ``pl.scan_parquet(source)`` returns
        a ``pl.LazyFrame`` that must be ``.collect()``-ed before display.
    """
    if source is None:
        source = dataset_url
    # Uncomment the next line to read local JSONL files instead:
    # return pl.read_ndjson("/local/path/to/*.jsonl")
    return pl.read_parquet(source)


# load_dataset() reads eagerly, so df is already a materialized DataFrame here.
df = load_dataset()
| # ------------------------------------------------------------------------------ | |
| # 3. Preview the DataFrame | |
| # | |
| # Define a custom command to preview the DataFrame with metadata. | |
| # mo.ui.table is assumed to render a rich interactive table. | |
| # ------------------------------------------------------------------------------ | |
def preview_dataframe(df: pl.DataFrame):
    """Build an interactive table showing the first rows of ``df``.

    ``mo.ui.table`` has no ``metadata`` keyword (passing one raises
    ``TypeError``), so the row/column counts are surfaced through the
    table's ``label`` instead; column names and data types are shown in the
    table header itself.

    Args:
        df: The DataFrame to preview.

    Returns:
        A ``mo.ui.table`` element over ``df.head()``.
    """
    head = df.head()
    total_rows, total_columns = df.shape
    summary = (
        f"Preview: first {head.height} of {total_rows:,} rows, "
        f"{total_columns} columns"
    )
    return mo.ui.table(head, label=summary)


# Obtain and render the preview.
preview = preview_dataframe(df)
preview
| # ------------------------------------------------------------------------------ | |
| # 4. Expand the DataFrame for Better Visualization | |
| # | |
| # Create an interactive button that, when clicked, renders the full DataFrame | |
| # with expanded display options (e.g. full width). | |
| # ------------------------------------------------------------------------------ | |
# Giving the button an initial value and an on_click handler makes each click
# flip its value, so dependent code can tell whether the expanded view is on.
expand_option = mo.ui.button(
    label="Expand Dataframe",
    value=False,
    on_click=lambda expanded: not expanded,
)


def expand_dataframe(page_size=50):
    """Return a paginated interactive table over the complete DataFrame.

    The table is returned (not just constructed) so that the caller can
    render it. ``mo.ui.table`` does not accept ``width``/``height``
    keywords, so the larger view is obtained through pagination settings.

    Args:
        page_size: Number of rows shown per page of the table.

    Returns:
        A ``mo.ui.table`` element over the module-level ``df``.
    """
    return mo.ui.table(df, pagination=True, page_size=page_size)


# Render the expand button.
expand_option
# To show the full table after a click, put the following in a separate cell
# (marimo does not let a cell read the value of a UI element it creates):
# expand_dataframe() if expand_option.value else None
| # ------------------------------------------------------------------------------ | |
| # 5. Commented-Out Formulas for Column Selection | |
| # | |
| # The following examples (commented out) demonstrate different column selection techniques: | |
| # | |
| # Example 1: Select specific columns by name: | |
| # selected_columns_df = df.select(["column1", "column2"]) | |
| # | |
| # Example 2: Select all columns except column 'a': | |
| # all_except_a_df = df.select(pl.exclude("a")) | |
| # | |
| # Example 3: Select a range of columns (e.g., from the second to the fourth column): | |
| # range_columns_df = df.select(pl.col(df.columns[1:4])) | |
| # ------------------------------------------------------------------------------ |