| { | |
| "pipeline": { | |
| "name": "customer_analytics_pipeline", | |
| "description": "End-to-end customer analytics data pipeline", | |
| "version": "2.1.0", | |
| "owner": "data-engineering@example.com", | |
| "created": "2025-01-15", | |
| "schedule": "daily at 02:00 UTC" | |
| }, | |
| "sources": [ | |
| { | |
| "id": "src_salesforce", | |
| "name": "Salesforce CRM", | |
| "type": "api", | |
| "connection": { | |
| "endpoint": "https://company.salesforce.com/api/v52.0", | |
| "auth": "oauth2" | |
| }, | |
| "objects": ["Account", "Contact", "Opportunity", "Lead"], | |
| "incremental_field": "LastModifiedDate" | |
| }, | |
| { | |
| "id": "src_stripe", | |
| "name": "Stripe Payments", | |
| "type": "api", | |
| "connection": { | |
| "endpoint": "https://api.stripe.com/v1", | |
| "auth": "api_key" | |
| }, | |
| "objects": ["charges", "customers", "subscriptions", "invoices"] | |
| }, | |
| { | |
| "id": "src_postgres_app", | |
| "name": "Application Database", | |
| "type": "database", | |
| "connection": { | |
| "host": "app-db.internal", | |
| "port": 5432, | |
| "database": "production" | |
| }, | |
| "tables": ["users", "user_events", "feature_flags", "subscriptions"] | |
| }, | |
| { | |
| "id": "src_segment", | |
| "name": "Segment Events", | |
| "type": "stream", | |
| "connection": { | |
| "type": "kafka", | |
| "topic": "segment-events", | |
| "bootstrap_servers": "kafka.internal:9092" | |
| }, | |
| "events": ["page", "track", "identify"] | |
| }, | |
| { | |
| "id": "src_google_analytics", | |
| "name": "Google Analytics 4", | |
| "type": "api", | |
| "connection": { | |
| "property_id": "GA4-123456789" | |
| }, | |
| "metrics": ["sessions", "users", "conversions", "revenue"] | |
| } | |
| ], | |
| "stages": [ | |
| { | |
| "id": "extract", | |
| "name": "Data Extraction", | |
| "steps": [ | |
| { | |
| "id": "ext_salesforce", | |
| "source": "src_salesforce", | |
| "output": "landing/salesforce/", | |
| "format": "parquet", | |
| "partitions": ["date"], | |
| "mode": "incremental" | |
| }, | |
| { | |
| "id": "ext_stripe", | |
| "source": "src_stripe", | |
| "output": "landing/stripe/", | |
| "format": "parquet", | |
| "mode": "incremental" | |
| }, | |
| { | |
| "id": "ext_postgres", | |
| "source": "src_postgres_app", | |
| "output": "landing/app_db/", | |
| "format": "parquet", | |
| "mode": "cdc" | |
| }, | |
| { | |
| "id": "ext_segment", | |
| "source": "src_segment", | |
| "output": "landing/segment/", | |
| "format": "parquet", | |
| "mode": "streaming" | |
| }, | |
| { | |
| "id": "ext_ga4", | |
| "source": "src_google_analytics", | |
| "output": "landing/ga4/", | |
| "format": "parquet", | |
| "mode": "batch" | |
| } | |
| ] | |
| }, | |
| { | |
| "id": "transform", | |
| "name": "Data Transformation", | |
| "steps": [ | |
| { | |
| "id": "tfm_customer_identity", | |
| "name": "Customer Identity Resolution", | |
| "inputs": ["ext_salesforce", "ext_stripe", "ext_postgres"], | |
| "output": "curated/customer_identity/", | |
| "logic": "Match and merge customer identities across systems using email, phone, and probabilistic matching", | |
| "technology": "Spark" | |
| }, | |
| { | |
| "id": "tfm_event_enrichment", | |
| "name": "Event Enrichment", | |
| "inputs": ["ext_segment", "ext_ga4", "tfm_customer_identity"], | |
| "output": "curated/events_enriched/", | |
| "logic": "Join events with customer identity and add session context" | |
| }, | |
| { | |
| "id": "tfm_revenue_calc", | |
| "name": "Revenue Calculation", | |
| "inputs": ["ext_stripe", "ext_salesforce", "tfm_customer_identity"], | |
| "output": "curated/revenue/", | |
| "logic": "Calculate MRR, ARR, churn, and expansion revenue metrics" | |
| }, | |
| { | |
| "id": "tfm_product_usage", | |
| "name": "Product Usage Metrics", | |
| "inputs": ["ext_postgres", "tfm_event_enrichment"], | |
| "output": "curated/product_usage/", | |
| "logic": "Aggregate product usage by customer and feature" | |
| } | |
| ] | |
| }, | |
| { | |
| "id": "model", | |
| "name": "Data Modeling", | |
| "steps": [ | |
| { | |
| "id": "mdl_dim_customer", | |
| "name": "Customer Dimension", | |
| "inputs": ["tfm_customer_identity", "tfm_revenue_calc"], | |
| "output": "warehouse.dim_customer", | |
| "type": "scd_type_2" | |
| }, | |
| { | |
| "id": "mdl_dim_product", | |
| "name": "Product Dimension", | |
| "inputs": ["ext_postgres"], | |
| "output": "warehouse.dim_product" | |
| }, | |
| { | |
| "id": "mdl_fct_events", | |
| "name": "Events Fact", | |
| "inputs": ["tfm_event_enrichment", "mdl_dim_customer", "mdl_dim_product"], | |
| "output": "warehouse.fct_events", | |
| "grain": "event" | |
| }, | |
| { | |
| "id": "mdl_fct_revenue", | |
| "name": "Revenue Fact", | |
| "inputs": ["tfm_revenue_calc", "mdl_dim_customer"], | |
| "output": "warehouse.fct_revenue", | |
| "grain": "transaction" | |
| }, | |
| { | |
| "id": "mdl_fct_usage", | |
| "name": "Usage Fact", | |
| "inputs": ["tfm_product_usage", "mdl_dim_customer", "mdl_dim_product"], | |
| "output": "warehouse.fct_usage", | |
| "grain": "daily_customer_feature" | |
| } | |
| ] | |
| }, | |
| { | |
| "id": "aggregate", | |
| "name": "Aggregations & Marts", | |
| "steps": [ | |
| { | |
| "id": "agg_customer_360", | |
| "name": "Customer 360 View", | |
| "inputs": ["mdl_dim_customer", "mdl_fct_events", "mdl_fct_revenue", "mdl_fct_usage"], | |
| "output": "marts.customer_360", | |
| "refresh": "hourly" | |
| }, | |
| { | |
| "id": "agg_revenue_metrics", | |
| "name": "Revenue Metrics", | |
| "inputs": ["mdl_fct_revenue", "mdl_dim_customer"], | |
| "output": "marts.revenue_metrics", | |
| "refresh": "daily" | |
| }, | |
| { | |
| "id": "agg_product_analytics", | |
| "name": "Product Analytics", | |
| "inputs": ["mdl_fct_usage", "mdl_fct_events", "mdl_dim_product"], | |
| "output": "marts.product_analytics", | |
| "refresh": "daily" | |
| }, | |
| { | |
| "id": "agg_health_score", | |
| "name": "Customer Health Score", | |
| "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], | |
| "output": "marts.customer_health_score", | |
| "logic": "ML-based health score prediction" | |
| } | |
| ] | |
| }, | |
| { | |
| "id": "publish", | |
| "name": "Data Publishing", | |
| "steps": [ | |
| { | |
| "id": "pub_looker", | |
| "name": "Looker Semantic Layer", | |
| "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], | |
| "output": "looker://models/customer_analytics", | |
| "type": "semantic_model" | |
| }, | |
| { | |
| "id": "pub_salesforce_sync", | |
| "name": "Salesforce Sync", | |
| "inputs": ["agg_customer_360", "agg_health_score"], | |
| "output": "salesforce://Account.HealthScore__c", | |
| "type": "reverse_etl" | |
| }, | |
| { | |
| "id": "pub_ml_features", | |
| "name": "ML Feature Store", | |
| "inputs": ["agg_customer_360", "agg_product_analytics"], | |
| "output": "feast://customer_features", | |
| "type": "feature_store" | |
| } | |
| ] | |
| } | |
| ], | |
| "data_quality": { | |
| "rules": [ | |
| {"table": "mdl_dim_customer", "check": "unique", "column": "customer_id"}, | |
| {"table": "mdl_fct_revenue", "check": "not_null", "columns": ["customer_id", "amount", "transaction_date"]}, | |
| {"table": "agg_revenue_metrics", "check": "freshness", "max_delay_hours": 2} | |
| ] | |
| }, | |
| "notes": "Comprehensive ETL pipeline sample showing data flow from multiple sources through extraction, transformation, modeling, aggregation, and publishing stages." | |
| } | |