Lineage-graph-accelerator / samples /etl_pipeline_sample.json
aamanlamba's picture
Phase 2: Enhanced lineage extraction with export to data catalogs
0510038
{
"pipeline": {
"name": "customer_analytics_pipeline",
"description": "End-to-end customer analytics data pipeline",
"version": "2.1.0",
"owner": "data-team@example.com",
"created": "2025-01-15",
"schedule": "daily at 02:00 UTC"
},
"sources": [
{
"id": "src_salesforce",
"name": "Salesforce CRM",
"type": "api",
"connection": {
"endpoint": "https://company.salesforce.com/api/v52.0",
"auth": "oauth2"
},
"objects": ["Account", "Contact", "Opportunity", "Lead"],
"incremental_field": "LastModifiedDate"
},
{
"id": "src_stripe",
"name": "Stripe Payments",
"type": "api",
"connection": {
"endpoint": "https://api.stripe.com/v1",
"auth": "api_key"
},
"objects": ["charges", "customers", "subscriptions", "invoices"]
},
{
"id": "src_postgres_app",
"name": "Application Database",
"type": "database",
"connection": {
"host": "app-db.internal",
"port": 5432,
"database": "production"
},
"tables": ["users", "user_events", "feature_flags", "subscriptions"]
},
{
"id": "src_segment",
"name": "Segment Events",
"type": "stream",
"connection": {
"type": "kafka",
"topic": "segment-events",
"bootstrap_servers": "kafka.internal:9092"
},
"events": ["page", "track", "identify"]
},
{
"id": "src_google_analytics",
"name": "Google Analytics 4",
"type": "api",
"connection": {
"property_id": "GA4-123456789"
},
"metrics": ["sessions", "users", "conversions", "revenue"]
}
],
"stages": [
{
"id": "extract",
"name": "Data Extraction",
"steps": [
{
"id": "ext_salesforce",
"source": "src_salesforce",
"output": "landing/salesforce/",
"format": "parquet",
"partitions": ["date"],
"mode": "incremental"
},
{
"id": "ext_stripe",
"source": "src_stripe",
"output": "landing/stripe/",
"format": "parquet",
"mode": "incremental"
},
{
"id": "ext_postgres",
"source": "src_postgres_app",
"output": "landing/app_db/",
"format": "parquet",
"mode": "cdc"
},
{
"id": "ext_segment",
"source": "src_segment",
"output": "landing/segment/",
"format": "parquet",
"mode": "streaming"
},
{
"id": "ext_ga4",
"source": "src_google_analytics",
"output": "landing/ga4/",
"format": "parquet",
"mode": "batch"
}
]
},
{
"id": "transform",
"name": "Data Transformation",
"steps": [
{
"id": "tfm_customer_identity",
"name": "Customer Identity Resolution",
"inputs": ["ext_salesforce", "ext_stripe", "ext_postgres"],
"output": "curated/customer_identity/",
"logic": "Match and merge customer identities across systems using email, phone, and probabilistic matching",
"technology": "Spark"
},
{
"id": "tfm_event_enrichment",
"name": "Event Enrichment",
"inputs": ["ext_segment", "ext_ga4", "tfm_customer_identity"],
"output": "curated/events_enriched/",
"logic": "Join events with customer identity and add session context"
},
{
"id": "tfm_revenue_calc",
"name": "Revenue Calculation",
"inputs": ["ext_stripe", "ext_salesforce", "tfm_customer_identity"],
"output": "curated/revenue/",
"logic": "Calculate MRR, ARR, churn, and expansion revenue metrics"
},
{
"id": "tfm_product_usage",
"name": "Product Usage Metrics",
"inputs": ["ext_postgres", "tfm_event_enrichment"],
"output": "curated/product_usage/",
"logic": "Aggregate product usage by customer and feature"
}
]
},
{
"id": "model",
"name": "Data Modeling",
"steps": [
{
"id": "mdl_dim_customer",
"name": "Customer Dimension",
"inputs": ["tfm_customer_identity", "tfm_revenue_calc"],
"output": "warehouse.dim_customer",
"type": "scd_type_2"
},
{
"id": "mdl_dim_product",
"name": "Product Dimension",
"inputs": ["ext_postgres"],
"output": "warehouse.dim_product"
},
{
"id": "mdl_fct_events",
"name": "Events Fact",
"inputs": ["tfm_event_enrichment", "mdl_dim_customer", "mdl_dim_product"],
"output": "warehouse.fct_events",
"grain": "event"
},
{
"id": "mdl_fct_revenue",
"name": "Revenue Fact",
"inputs": ["tfm_revenue_calc", "mdl_dim_customer"],
"output": "warehouse.fct_revenue",
"grain": "transaction"
},
{
"id": "mdl_fct_usage",
"name": "Usage Fact",
"inputs": ["tfm_product_usage", "mdl_dim_customer", "mdl_dim_product"],
"output": "warehouse.fct_usage",
"grain": "daily_customer_feature"
}
]
},
{
"id": "aggregate",
"name": "Aggregations & Marts",
"steps": [
{
"id": "agg_customer_360",
"name": "Customer 360 View",
"inputs": ["mdl_dim_customer", "mdl_fct_events", "mdl_fct_revenue", "mdl_fct_usage"],
"output": "marts.customer_360",
"refresh": "hourly"
},
{
"id": "agg_revenue_metrics",
"name": "Revenue Metrics",
"inputs": ["mdl_fct_revenue", "mdl_dim_customer"],
"output": "marts.revenue_metrics",
"refresh": "daily"
},
{
"id": "agg_product_analytics",
"name": "Product Analytics",
"inputs": ["mdl_fct_usage", "mdl_fct_events", "mdl_dim_product"],
"output": "marts.product_analytics",
"refresh": "daily"
},
{
"id": "agg_health_score",
"name": "Customer Health Score",
"inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"],
"output": "marts.customer_health_score",
"logic": "ML-based health score prediction"
}
]
},
{
"id": "publish",
"name": "Data Publishing",
"steps": [
{
"id": "pub_looker",
"name": "Looker Semantic Layer",
"inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"],
"output": "looker://models/customer_analytics",
"type": "semantic_model"
},
{
"id": "pub_salesforce_sync",
"name": "Salesforce Sync",
"inputs": ["agg_customer_360", "agg_health_score"],
"output": "salesforce://Account.HealthScore__c",
"type": "reverse_etl"
},
{
"id": "pub_ml_features",
"name": "ML Feature Store",
"inputs": ["agg_customer_360", "agg_product_analytics"],
"output": "feast://customer_features",
"type": "feature_store"
}
]
}
],
"data_quality": {
"rules": [
{"table": "mdl_dim_customer", "check": "unique", "column": "customer_id"},
{"table": "mdl_fct_revenue", "check": "not_null", "columns": ["customer_id", "amount", "transaction_date"]},
{"table": "agg_revenue_metrics", "check": "freshness", "max_delay_hours": 2}
]
},
"notes": "Comprehensive ETL pipeline sample showing data flow from multiple sources through extraction, transformation, modeling, aggregation, and publishing stages."
}