{ "pipeline": { "name": "customer_analytics_pipeline", "description": "End-to-end customer analytics data pipeline", "version": "2.1.0", "owner": "data-engineering@company.com", "created": "2025-01-15", "schedule": "daily at 02:00 UTC" }, "sources": [ { "id": "src_salesforce", "name": "Salesforce CRM", "type": "api", "connection": { "endpoint": "https://company.salesforce.com/api/v52.0", "auth": "oauth2" }, "objects": ["Account", "Contact", "Opportunity", "Lead"], "incremental_field": "LastModifiedDate" }, { "id": "src_stripe", "name": "Stripe Payments", "type": "api", "connection": { "endpoint": "https://api.stripe.com/v1", "auth": "api_key" }, "objects": ["charges", "customers", "subscriptions", "invoices"] }, { "id": "src_postgres_app", "name": "Application Database", "type": "database", "connection": { "host": "app-db.internal", "port": 5432, "database": "production" }, "tables": ["users", "user_events", "feature_flags", "subscriptions"] }, { "id": "src_segment", "name": "Segment Events", "type": "stream", "connection": { "type": "kafka", "topic": "segment-events", "bootstrap_servers": "kafka.internal:9092" }, "events": ["page", "track", "identify"] }, { "id": "src_google_analytics", "name": "Google Analytics 4", "type": "api", "connection": { "property_id": "GA4-123456789" }, "metrics": ["sessions", "users", "conversions", "revenue"] } ], "stages": [ { "id": "extract", "name": "Data Extraction", "steps": [ { "id": "ext_salesforce", "source": "src_salesforce", "output": "landing/salesforce/", "format": "parquet", "partitions": ["date"], "mode": "incremental" }, { "id": "ext_stripe", "source": "src_stripe", "output": "landing/stripe/", "format": "parquet", "mode": "incremental" }, { "id": "ext_postgres", "source": "src_postgres_app", "output": "landing/app_db/", "format": "parquet", "mode": "cdc" }, { "id": "ext_segment", "source": "src_segment", "output": "landing/segment/", "format": "parquet", "mode": "streaming" }, { "id": "ext_ga4", "source": 
"src_google_analytics", "output": "landing/ga4/", "format": "parquet", "mode": "batch" } ] }, { "id": "transform", "name": "Data Transformation", "steps": [ { "id": "tfm_customer_identity", "name": "Customer Identity Resolution", "inputs": ["ext_salesforce", "ext_stripe", "ext_postgres"], "output": "curated/customer_identity/", "logic": "Match and merge customer identities across systems using email, phone, and probabilistic matching", "technology": "Spark" }, { "id": "tfm_event_enrichment", "name": "Event Enrichment", "inputs": ["ext_segment", "ext_ga4", "tfm_customer_identity"], "output": "curated/events_enriched/", "logic": "Join events with customer identity and add session context" }, { "id": "tfm_revenue_calc", "name": "Revenue Calculation", "inputs": ["ext_stripe", "ext_salesforce", "tfm_customer_identity"], "output": "curated/revenue/", "logic": "Calculate MRR, ARR, churn, and expansion revenue metrics" }, { "id": "tfm_product_usage", "name": "Product Usage Metrics", "inputs": ["ext_postgres", "tfm_event_enrichment"], "output": "curated/product_usage/", "logic": "Aggregate product usage by customer and feature" } ] }, { "id": "model", "name": "Data Modeling", "steps": [ { "id": "mdl_dim_customer", "name": "Customer Dimension", "inputs": ["tfm_customer_identity", "tfm_revenue_calc"], "output": "warehouse.dim_customer", "type": "scd_type_2" }, { "id": "mdl_dim_product", "name": "Product Dimension", "inputs": ["ext_postgres"], "output": "warehouse.dim_product" }, { "id": "mdl_fct_events", "name": "Events Fact", "inputs": ["tfm_event_enrichment", "mdl_dim_customer", "mdl_dim_product"], "output": "warehouse.fct_events", "grain": "event" }, { "id": "mdl_fct_revenue", "name": "Revenue Fact", "inputs": ["tfm_revenue_calc", "mdl_dim_customer"], "output": "warehouse.fct_revenue", "grain": "transaction" }, { "id": "mdl_fct_usage", "name": "Usage Fact", "inputs": ["tfm_product_usage", "mdl_dim_customer", "mdl_dim_product"], "output": "warehouse.fct_usage", "grain": 
"daily_customer_feature" } ] }, { "id": "aggregate", "name": "Aggregations & Marts", "steps": [ { "id": "agg_customer_360", "name": "Customer 360 View", "inputs": ["mdl_dim_customer", "mdl_fct_events", "mdl_fct_revenue", "mdl_fct_usage"], "output": "marts.customer_360", "refresh": "hourly" }, { "id": "agg_revenue_metrics", "name": "Revenue Metrics", "inputs": ["mdl_fct_revenue", "mdl_dim_customer"], "output": "marts.revenue_metrics", "refresh": "daily" }, { "id": "agg_product_analytics", "name": "Product Analytics", "inputs": ["mdl_fct_usage", "mdl_fct_events", "mdl_dim_product"], "output": "marts.product_analytics", "refresh": "daily" }, { "id": "agg_health_score", "name": "Customer Health Score", "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], "output": "marts.customer_health_score", "logic": "ML-based health score prediction" } ] }, { "id": "publish", "name": "Data Publishing", "steps": [ { "id": "pub_looker", "name": "Looker Semantic Layer", "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], "output": "looker://models/customer_analytics", "type": "semantic_model" }, { "id": "pub_salesforce_sync", "name": "Salesforce Sync", "inputs": ["agg_customer_360", "agg_health_score"], "output": "salesforce://Account.HealthScore__c", "type": "reverse_etl" }, { "id": "pub_ml_features", "name": "ML Feature Store", "inputs": ["agg_customer_360", "agg_product_analytics"], "output": "feast://customer_features", "type": "feature_store" } ] } ], "data_quality": { "rules": [ {"table": "mdl_dim_customer", "check": "unique", "column": "customer_id"}, {"table": "mdl_fct_revenue", "check": "not_null", "columns": ["customer_id", "amount", "transaction_date"]}, {"table": "agg_revenue_metrics", "check": "freshness", "max_delay_hours": 2} ] }, "notes": "Comprehensive ETL pipeline sample showing data flow from multiple sources through extraction, transformation, modeling, aggregation, and publishing stages." }