import marimo

__generated_with = "0.10.15"
app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")


# =============================================================================
# Imports Cell
# =============================================================================
@app.cell
def imports():
    # Imports live in a cell so marimo can pass them to the cells below.
    import os

    import marimo as mo
    import polars as pl
    return mo, os, pl
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:

          ```bash
          pip install polars marimo
          ```

        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.
        """
    )
    return
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo, os):
    hf_token = os.environ.get("HF_TOKEN")
    # Report whether the token is set, without rendering the secret itself.
    token_status = "set" if hf_token else "NOT set"
    mo.md(
        f"""
        **Hugging Face Token:** `{token_status}`

        *(Ensure that `HF_TOKEN` is set in your environment.)*
        """
    )
    return (hf_token,)
# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(pl):
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
    # scan_parquet defers execution: no data is read until the LazyFrame is collected.
    df = pl.scan_parquet(dataset_url)
    # --- Alternative for local JSONL files (uncomment if needed):
    # df = pl.scan_ndjson("/local/path/to/*.jsonl")
    return (df,)
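
# Added sketch (not in the original notebook): materialize a small sample to
# sanity-check the scan. `collect()` triggers the actual read, so keep the
# slice small while exploring.
@app.cell
def sample_rows(df):
    sample = df.head(5).collect()  # reads just enough data for five rows
    sample
    return (sample,)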
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(mo, df):
    # `df` is the LazyFrame from lazy_load_dataset; collect a small sample so
    # the table has concrete rows to render.
    preview = mo.ui.table(df.head(10).collect())
    mo.vstack(
        [
            mo.md(
                r"""
                ## Data Preview

                Below is a preview of the first few rows along with basic metadata.
                """
            ),
            preview,
        ]
    )
    return (preview,)
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # Each click increments the button's value; any cell that reads
    # `expand_button.value` re-runs automatically (marimo's reactive model).
    expand_button = mo.ui.button(label="Expand Dataframe", value=0, on_click=lambda count: count + 1)
    mo.vstack(
        [
            mo.md(
                r"""
                ## Expand Dataframe

                Click the button below to expand the DataFrame view.
                """
            ),
            expand_button,
        ]
    )
    return (expand_button,)
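
# Added sketch (not in the original notebook): the dependent cell that renders
# the expanded view. It reads `expand_button.value`, so marimo re-runs it on
# every click. Note that `df.collect()` loads the full dataset into memory.
@app.cell
def expanded_table(mo, df, expand_button):
    if expand_button.value > 0:
        view = mo.ui.table(df.collect())
    else:
        view = mo.md("*Click the button above to render the full table.*")
    view
    return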
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**

        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**

        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**

        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return
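
# Added sketch (not in the original notebook): a small, self-contained demo of
# the three selection styles above, using a toy DataFrame rather than the
# Hugging Face dataset.
@app.cell
def column_selection_demo(pl):
    toy = pl.DataFrame(
        {"a": [1, 2], "column1": ["x", "y"], "column2": [0.1, 0.2], "b": [True, False]}
    )
    by_name = toy.select(["column1", "column2"])      # specific columns
    without_a = toy.select(pl.exclude("a"))           # everything except "a"
    col_range = toy.select(pl.col(toy.columns[1:4]))  # 2nd through 4th column
    print(by_name, without_a, col_range, sep="\n")
    return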
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================
@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return


@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return


@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return


@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame

        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return


@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema

        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return
@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return


@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data

        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return
@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled

        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return


@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return


@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files

        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return


@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return


@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides

        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return
@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to an NDJSON File

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        df.write_ndjson("output/data.ndjson")
        ```
        """
    )
    return
@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options

        ```python
        import os

        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options

        ```python
        import os

        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return
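
# Added note (not in the original notebook): you can verify that the filter in
# Example 16 is pushed into the scan by printing the optimized query plan.
@app.cell
def example_16b(mo):
    mo.md(
        r"""
        ### Example 16b: Inspecting the Optimized Plan

        ```python
        # The optimized plan shows the predicate applied inside the Parquet scan,
        # confirming that pushdown avoids reading non-matching rows.
        print(df_filtered.explain())
        ```
        """
    )
    return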
@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return


@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return
@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the character length of each value in the 'text' column
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return
@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # `df` is eager here, so group_by returns a DataFrame directly (no collect needed)
        grouped = df.group_by("category").agg(pl.col("text").str.len_chars().mean().alias("avg_text_length"))
        print(grouped)
        ```
        """
    )
    return
@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key

        ```python
        df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins

        ```python
        from datetime import datetime

        df1 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True),
            "text": ["sample text"] * 25,
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True),
            "value": list(range(24)),
        })
        # Perform an asof join to match each row with the nearest earlier timestamp
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return


@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return


@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files While Allowing Missing Columns

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - 30 examples covering various Polars I/O functions and DataFrame operations,
          which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return
if __name__ == "__main__":
    app.run()