import marimo

__generated_with = "0.10.15"
app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")


# =============================================================================
# Imports Cell
# =============================================================================
@app.cell
def imports():
    # Imports live in a cell so marimo can pass them to the cells below.
    import os

    import marimo as mo
    import polars as pl
    return mo, os, pl
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:

          ```bash
          pip install polars marimo
          ```

        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.
        """
    )
    return
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo, os):
    hf_token = os.environ.get("HF_TOKEN")
    # Report whether the token is set, without rendering the secret itself.
    token_status = "set" if hf_token else "NOT set"
    mo.md(
        f"""
        **Hugging Face Token:** `{token_status}`

        *(Ensure that `HF_TOKEN` is set in your environment.)*
        """
    )
    return (hf_token,)
# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(pl):
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
    # scan_parquet defers execution: no data is read until the LazyFrame is collected.
    df = pl.scan_parquet(dataset_url)
    # --- Alternative for local JSONL files (uncomment if needed):
    # df = pl.scan_ndjson("/local/path/to/*.jsonl")
    return (df,)
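
# Added sketch (not in the original notebook): materialize a small sample to
# sanity-check the scan. `collect()` triggers the actual read, so keep the
# slice small while exploring.
@app.cell
def sample_rows(df):
    sample = df.head(5).collect()  # reads just enough data for five rows
    sample
    return (sample,)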
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(mo, df):
    # `df` is the LazyFrame from lazy_load_dataset; collect a small sample so
    # the table has concrete rows to render.
    preview = mo.ui.table(df.head(10).collect())
    mo.vstack(
        [
            mo.md(
                r"""
                ## Data Preview

                Below is a preview of the first few rows along with basic metadata.
                """
            ),
            preview,
        ]
    )
    return (preview,)
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # Each click increments the button's value; any cell that reads
    # `expand_button.value` re-runs automatically (marimo's reactive model).
    expand_button = mo.ui.button(label="Expand Dataframe", value=0, on_click=lambda count: count + 1)
    mo.vstack(
        [
            mo.md(
                r"""
                ## Expand Dataframe

                Click the button below to expand the DataFrame view.
                """
            ),
            expand_button,
        ]
    )
    return (expand_button,)
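
# Added sketch (not in the original notebook): the dependent cell that renders
# the expanded view. It reads `expand_button.value`, so marimo re-runs it on
# every click. Note that `df.collect()` loads the full dataset into memory.
@app.cell
def expanded_table(mo, df, expand_button):
    if expand_button.value > 0:
        view = mo.ui.table(df.collect())
    else:
        view = mo.md("*Click the button above to render the full table.*")
    view
    return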
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**

        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**

        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**

        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return
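
# Added sketch (not in the original notebook): a small, self-contained demo of
# the three selection styles above, using a toy DataFrame rather than the
# Hugging Face dataset.
@app.cell
def column_selection_demo(pl):
    toy = pl.DataFrame(
        {"a": [1, 2], "column1": ["x", "y"], "column2": [0.1, 0.2], "b": [True, False]}
    )
    by_name = toy.select(["column1", "column2"])      # specific columns
    without_a = toy.select(pl.exclude("a"))           # everything except "a"
    col_range = toy.select(pl.col(toy.columns[1:4]))  # 2nd through 4th column
    print(by_name, without_a, col_range, sep="\n")
    return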
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================
@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return


@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return


@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return


@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame

        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return


@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema

        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return
@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return


@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data

        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return
@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled

        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return


@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return


@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files

        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return


@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return


@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides

        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return
@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to an NDJSON File

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        df.write_ndjson("output/data.ndjson")
        ```
        """
    )
    return
@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options

        ```python
        import os

        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options

        ```python
        import os

        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return
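
# Added note (not in the original notebook): you can verify that the filter in
# Example 16 is pushed into the scan by printing the optimized query plan.
@app.cell
def example_16b(mo):
    mo.md(
        r"""
        ### Example 16b: Inspecting the Optimized Plan

        ```python
        # The optimized plan shows the predicate applied inside the Parquet scan,
        # confirming that pushdown avoids reading non-matching rows.
        print(df_filtered.explain())
        ```
        """
    )
    return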
@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return


@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return
@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the character length of each value in the 'text' column
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return
@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # `df` is eager here, so group_by returns a DataFrame directly (no collect needed)
        grouped = df.group_by("category").agg(pl.col("text").str.len_chars().mean().alias("avg_text_length"))
        print(grouped)
        ```
        """
    )
    return
@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key

        ```python
        df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins

        ```python
        from datetime import datetime

        df1 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True),
            "text": ["sample text"] * 25,
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True),
            "value": list(range(24)),
        })
        # Perform an asof join to match each row with the nearest earlier timestamp
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return


@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return


@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files While Allowing Missing Columns

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - 30 examples covering various Polars I/O functions and DataFrame operations,
          which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return
if __name__ == "__main__":
    app.run()