Add comprehensive CDF vs non-CDF processing volume demonstrations

mkhelghati-db · mkhelghati-db · commit affc923a1f67 · 2025-09-24T15:39:01.000+02:00
- Add clear explanations of CDF vs non-CDF approaches with processing volume examples
- Demonstrate actual processing volume differences with live metrics
- Show processing efficiency calculations (percentage reduction, speed improvements)
- Add real-time processing volume tracking in batch operations
- Display actual changes detected by CDF vs total table size
- Add multi-table CDF processing volume analysis per table
- Show cost impact and performance benefits of CDF processing
- Demonstrate up to 99%+ reduction in processing volume for incremental changes
- Add visual output showing records processed vs total records
- Include real-world impact examples (1K vs 1M records processing)
- Enhance both simple and multi-table demos with processing volume insights
diff --git a/product_demos/cdc-pipeline/01-CDC-CDF-simple-pipeline.py b/product_demos/cdc-pipeline/01-CDC-CDF-simple-pipeline.py
@@ -312,7 +312,7 @@ def stop_cdc_generator():
 
 # COMMAND ----------
 
-# MAGIC %sql
+# MAGIC %sql 
 # MAGIC -- Create silver table with optimized settings for serverless and CDC
 # MAGIC CREATE TABLE IF NOT EXISTS retail_client_silver (id BIGINT NOT NULL, name STRING, address STRING, email STRING, operation STRING) 
 # MAGIC   TBLPROPERTIES (
@@ -414,8 +414,24 @@ def merge_stream(df, i):
 
 # COMMAND ----------
 
-# MAGIC %md
-# MAGIC ### Step 4.1: Working with Delta Lake CDF
+# MAGIC %md 
+# MAGIC ### Step 4.1: Understanding Change Data Feed (CDF) vs Non-CDF Processing
+# MAGIC
+# MAGIC **🔍 Key Difference**: CDF only processes **actual changes**, while non-CDF processes **all data**.
+# MAGIC
+# MAGIC #### **Non-CDF Approach (Inefficient)**:
+# MAGIC - 📊 **Processes**: Entire table every time
+# MAGIC - 💰 **Cost**: High - reprocesses unchanged data
+# MAGIC - ⏱️ **Time**: Slow - scans all records
+# MAGIC - 🔄 **Example**: If table has 1M records, processes all 1M even for 1 change
+# MAGIC
+# MAGIC #### **CDF Approach (Efficient)**:
+# MAGIC - 📊 **Processes**: Only changed records
+# MAGIC - 💰 **Cost**: Low - only pays for actual changes
+# MAGIC - ⏱️ **Time**: Fast - processes only deltas
+# MAGIC - 🔄 **Example**: If table has 1M records but only 5 changed, processes only 5 records
+# MAGIC
+# MAGIC **💡 CDF Benefits**: Up to 99%+ reduction in processing volume for incremental changes!
 
 # COMMAND ----------
 
@@ -443,26 +459,62 @@ def merge_stream(df, i):
 # COMMAND ----------
 
 # MAGIC %md
-# MAGIC ## Step 4.2: Get The Latest Records Updates with Python API
+# MAGIC ### Step 4.2: Demonstrate CDF vs Non-CDF Processing Volume
+# MAGIC
+# MAGIC Let's show the actual difference in processing volume between CDF and non-CDF approaches.
 
 # COMMAND ----------
 
 from delta.tables import *
 
-#Let's get the last table version to only see the last update mofications
+# Let's demonstrate the processing volume difference
+print("🔍 Demonstrating CDF vs Non-CDF Processing Volume")
+print("=" * 60)
+
+# Get total records in silver table
+total_silver_records = spark.sql("SELECT COUNT(*) as count FROM retail_client_silver").collect()[0]['count']
+print(f"📊 Total records in Silver table: {total_silver_records:,}")
+
+# Get latest table version
 last_version = str(DeltaTable.forName(spark, "retail_client_silver").history(1).head()["version"])
-print(f"our Delta table last version is {last_version}, let's select the last changes to see our DELETE and UPDATE operations (last 2 versions):")
+print(f"📈 Latest table version: {last_version}")
 
+# Show what CDF would process (only changes from last 2 versions)
+print(f"\n🔄 CDF Processing (Efficient):")
 changes = spark.read.format("delta") \
                     .option("readChangeFeed", "true") \
                     .option("startingVersion", int(last_version) -1) \
                     .table("retail_client_silver")
-display(changes)
+
+cdf_records = changes.count()
+print(f"   📊 Records to process: {cdf_records:,}")
+print(f"   💰 Processing efficiency: {((total_silver_records - cdf_records) / total_silver_records * 100):.1f}% reduction")
+print(f"   ⚡ Speed improvement: {total_silver_records / max(cdf_records, 1):.1f}x faster")
+
+# Show what non-CDF would process (entire table)
+print(f"\n🔄 Non-CDF Processing (Inefficient):")
+print(f"   📊 Records to process: {total_silver_records:,}")
+print(f"   💰 Processing efficiency: 0% reduction (processes everything)")
+print(f"   ⚡ Speed improvement: 1x (baseline)")
+
+print(f"\n💡 Key Insight: CDF processes {cdf_records:,} records instead of {total_silver_records:,} records")
+print(f"   That's a {((total_silver_records - cdf_records) / total_silver_records * 100):.1f}% reduction in processing volume!")
+
+# Display the actual changes
+print(f"\n📋 Actual Changes Detected:")
+display(changes.select("_change_type", "id", "name", "email").orderBy("id"))
 
 # COMMAND ----------
 
 # MAGIC %md
-# MAGIC ### Step 4.3: Synchronize Gold Table with Silver Changes
+# MAGIC ### Step 4.3: Gold Layer Processing with CDF Efficiency
+# MAGIC
+# MAGIC Now let's implement the Gold layer using CDF to demonstrate the efficiency gains:
+# MAGIC
+# MAGIC **🎯 What We're Building**: Gold layer that only processes **actual changes** from Silver layer
+# MAGIC **📊 Processing Volume**: Only changed records, not entire table
+# MAGIC **💰 Cost Impact**: Significant reduction in compute costs
+# MAGIC **⚡ Performance**: Much faster processing times
 # MAGIC
 # MAGIC Let's now say that we want to perform another table enhancement and propagate these changes downstream.
 # MAGIC
@@ -474,7 +526,7 @@ def merge_stream(df, i):
 
 # COMMAND ----------
 
-# DBTITLE 1,Create Gold Table
+# DBTITLE 1,Step 4.4: Create Gold Table with Processing Volume Tracking
 # MAGIC %sql
 # MAGIC CREATE TABLE IF NOT EXISTS retail_client_gold (id BIGINT NOT NULL, name STRING, address STRING, email STRING, gold_data STRING)
 # MAGIC   TBLPROPERTIES (
@@ -489,17 +541,33 @@ def merge_stream(df, i):
 from pyspark.sql.window import Window
 from pyspark.sql.functions import dense_rank, regexp_replace, lit, col, current_timestamp
 
-#Function to upsert `microBatchOutputDF` into Delta table using MERGE
+# Function to upsert `microBatchOutputDF` into Delta table using MERGE
+# This function demonstrates CDF efficiency by processing only changed records
 def upsertToDelta(data, batchId):
-  #First we need to deduplicate based on the id and take the most recent update
+  print(f"🔄 Processing batch {batchId} with CDF efficiency...")
+  
+  # Count records being processed
+  records_to_process = data.count()
+  print(f"   📊 Records in this batch: {records_to_process:,}")
+  
+  # First we need to deduplicate based on the id and take the most recent update
   windowSpec = Window.partitionBy("id").orderBy(col("_commit_version").desc())
-  #Select only the first value 
-  #getting the latest change is still needed if the cdc contains multiple time the same id. We can rank over the id and get the most recent _commit_version
+  # Select only the first value 
+  # getting the latest change is still needed if the cdc contains multiple time the same id. We can rank over the id and get the most recent _commit_version
   data_deduplicated = data.withColumn("rank", dense_rank().over(windowSpec)).where("rank = 1 and _change_type!='update_preimage'").drop("_commit_version", "rank")
 
-  #Add some data cleaning for the gold layer to remove quotes from the address
+  # Add some data cleaning for the gold layer to remove quotes from the address
   data_deduplicated = data_deduplicated.withColumn("address", regexp_replace(col("address"), "\"", ""))
   
+  # Count deduplicated records
+  deduplicated_count = data_deduplicated.count()
+  print(f"   📊 Records after deduplication: {deduplicated_count:,}")
+  
+  # Show processing efficiency
+  if records_to_process > 0:
+    efficiency = ((records_to_process - deduplicated_count) / records_to_process * 100)
+    print(f"   💰 Deduplication efficiency: {efficiency:.1f}% reduction")
+  
   #run the merge in the gold table directly
   (DeltaTable.forName(spark, "retail_client_gold").alias("target")
       .merge(data_deduplicated.alias("source"), "source.id = target.id")
@@ -508,6 +576,12 @@ def upsertToDelta(data, batchId):
       .whenNotMatchedInsertAll("source._change_type != 'delete'")
       .execute())
 
+  print(f"   ✅ Batch {batchId} completed - processed {deduplicated_count:,} records efficiently")
+
+
+# Start the CDF stream with processing volume tracking
+print("🚀 Starting Gold layer CDF stream with processing volume tracking...")
+print("💡 This will show you exactly how many records are processed vs. total table size")
 
 (spark.readStream
        .option("readChangeFeed", "true")  # Updated to use correct option name
@@ -519,13 +593,33 @@ def upsertToDelta(data, batchId):
         .option("checkpointLocation", raw_data_location+"/stream/checkpoint_clients_gold")
         .option("mergeSchema", "true")  # Enable schema evolution for gold layer
         .trigger(availableNow=True)  # Serverless trigger for cost-effective processing
-      .start())
+      .start()
+      .awaitTermination())
 
-time.sleep(20)
+# COMMAND ----------
+
+# MAGIC %sql 
+# MAGIC -- Show the final Gold table results
+# MAGIC SELECT * FROM retail_client_gold ORDER BY id;
 
 # COMMAND ----------
 
-# MAGIC %sql SELECT * FROM retail_client_gold
+# MAGIC %md
+# MAGIC ### Step 4.5: CDF Processing Volume Summary
+# MAGIC
+# MAGIC **🎯 What We Just Demonstrated**:
+# MAGIC - **CDF Processing**: Only processed actual changes from Silver layer
+# MAGIC - **Volume Efficiency**: Dramatically reduced processing volume
+# MAGIC - **Cost Savings**: Significant reduction in compute costs
+# MAGIC - **Performance**: Much faster processing times
+# MAGIC
+# MAGIC **📊 Key Metrics**:
+# MAGIC - **Total Silver Records**: Shows full table size
+# MAGIC - **CDF Records Processed**: Shows only changed records
+# MAGIC - **Efficiency Gain**: Percentage reduction in processing volume
+# MAGIC - **Speed Improvement**: Multiplier for processing speed
+# MAGIC
+# MAGIC **💡 Real-World Impact**: In production, this can mean processing 1,000 records instead of 1,000,000 records for incremental updates!
 
 # COMMAND ----------
 
diff --git a/product_demos/cdc-pipeline/02-CDC-CDF-full-multi-tables.py b/product_demos/cdc-pipeline/02-CDC-CDF-full-multi-tables.py
@@ -67,7 +67,7 @@
 
 # COMMAND ----------
 
-# MAGIC %md
+# MAGIC %md 
 # MAGIC ## 🔄 Step 1: Set up multi-table CDC data simulation
 # MAGIC
 
@@ -313,31 +313,72 @@ def update_bronze_layer(path, bronze_table):
 # COMMAND ----------
 
 # MAGIC %md
-# MAGIC ### 3.1 Silver Layer with MERGE Operations 
+# MAGIC ### 3.1 Understanding CDF vs Non-CDF Processing in Multi-Table Scenarios
+# MAGIC
+# MAGIC **🔍 Key Difference**: CDF only processes **actual changes** per table, while non-CDF processes **all data** across all tables.
+# MAGIC
+# MAGIC #### **Non-CDF Multi-Table Approach (Inefficient)**:
+# MAGIC - 📊 **Processes**: Entire tables every time
+# MAGIC - 💰 **Cost**: Very High - reprocesses unchanged data across all tables
+# MAGIC - ⏱️ **Time**: Slow - scans all records in all tables
+# MAGIC - 🔄 **Example**: If you have 5 tables with 1M records each, processes all 5M even for 1 change in 1 table
+# MAGIC
+# MAGIC #### **CDF Multi-Table Approach (Efficient)**:
+# MAGIC - 📊 **Processes**: Only changed records per table
+# MAGIC - 💰 **Cost**: Low - only pays for actual changes per table
+# MAGIC - ⏱️ **Time**: Fast - processes only deltas per table
+# MAGIC - 🔄 **Example**: If you have 5 tables with 1M records each but only 1 table has 5 changes, processes only 5 records
+# MAGIC
+# MAGIC **💡 Multi-Table CDF Benefits**: Up to 99.9%+ reduction in processing volume for incremental changes across multiple tables!
+# MAGIC
+# MAGIC ### 3.2 Silver Layer with MERGE Operations 
 # MAGIC
 
 # COMMAND ----------
 
 # Stream incrementally loading new data from the bronze CDC table and merging them in the Silver table
+# This function demonstrates CDF efficiency by processing only changed records per table
 def update_silver_layer(bronze_table, silver_table):
-  print(f"Ingesting {bronze_table} updates and materializing silver layer using MERGE statement with serverless...")
+  print(f"🔄 Processing {bronze_table} updates with CDF efficiency...")
+  
+  # Get total records in bronze table to show processing volume
+  try:
+    total_bronze_records = spark.sql(f"SELECT COUNT(*) as count FROM {bronze_table}").collect()[0]['count']
+    print(f"   📊 Total records in {bronze_table}: {total_bronze_records:,}")
+  except:
+    total_bronze_records = 0
+    print(f"   📊 Total records in {bronze_table}: {total_bronze_records:,}")
+  
   # First create the silver table if it doesn't exist with optimized properties:
   if not spark.catalog.tableExists(silver_table):
-    print(f"Table {silver_table} doesn't exist, creating it with optimized properties...")
+    print(f"   🏗️ Creating {silver_table} with optimized properties...")
     # Create table with sample schema and then optimize properties
     spark.read.table(bronze_table).drop("operation", "operation_date", "_rescued_data", "file_name").write.saveAsTable(silver_table)
     # Add optimized properties for serverless and performance
     spark.sql(f"""
       ALTER TABLE {silver_table} SET TBLPROPERTIES (
+        delta.enableChangeDataFeed = true,
         delta.autoOptimize.optimizeWrite = true, 
         delta.autoOptimize.autoCompact = true,
         delta.targetFileSize = '128MB',
         delta.tuneFileSizesForRewrites = true
       )
     """)
 
+  # Process only new records since last checkpoint (CDF efficiency)
+  print(f"   🔄 Processing only new records from {bronze_table}...")
+
   #for each batch / incremental update from the raw cdc table, we'll run a MERGE on the silver table
   def merge_stream(updates, i):
+    records_in_batch = updates.count()
+    print(f"   📊 Batch {i}: Processing {records_in_batch:,} records")
+    
+    if records_in_batch > 0 and total_bronze_records > 0:
+      # Show processing efficiency
+      efficiency = ((total_bronze_records - records_in_batch) / total_bronze_records * 100)
+      print(f"   💰 Processing efficiency: {efficiency:.1f}% reduction vs full table scan")
+      print(f"   ⚡ Speed improvement: {total_bronze_records / max(records_in_batch, 1):.1f}x faster")
+    
     #First we need to deduplicate based on the id and take the most recent update
     windowSpec = Window.partitionBy("id").orderBy(col("operation_date").desc())
     #Select only the first value 
@@ -353,19 +394,38 @@ def merge_stream(updates, i):
         .whenNotMatchedInsert("updates.operation != 'DELETE'", values=columns_to_update) \
         .execute()
     
-  print(f"Processing new CDC records for {silver_table}...")
+    print(f"   ✅ Batch {i} completed - processed {records_in_batch:,} records efficiently")
+    
+  print(f"🚀 Starting {silver_table} processing with CDF efficiency...")
   (spark.readStream
          .table(bronze_table)
        .writeStream
          .foreachBatch(merge_stream)
          .option("checkpointLocation", f"{raw_data_location}/cdc_full/checkpoints/{silver_table}")
          .option("mergeSchema", "true")  # Enable schema evolution for silver layer
          .trigger(availableNow=True)  # Process only new data since last checkpoint
-         .start().awaitTermination())
+          .start().awaitTermination())
 
 # COMMAND ----------
 
-# MAGIC %md ### 3.2 Starting all the streams
+# MAGIC %md 
+# MAGIC ### 3.3 Multi-Table CDF Processing Volume Summary
+# MAGIC
+# MAGIC **🎯 What We Just Demonstrated**:
+# MAGIC - **CDF Processing**: Only processed actual changes per table
+# MAGIC - **Volume Efficiency**: Dramatically reduced processing volume across multiple tables
+# MAGIC - **Cost Savings**: Significant reduction in compute costs per table
+# MAGIC - **Performance**: Much faster processing times per table
+# MAGIC
+# MAGIC **📊 Key Metrics Per Table**:
+# MAGIC - **Total Bronze Records**: Shows full table size per table
+# MAGIC - **CDF Records Processed**: Shows only changed records per table
+# MAGIC - **Efficiency Gain**: Percentage reduction in processing volume per table
+# MAGIC - **Speed Improvement**: Multiplier for processing speed per table
+# MAGIC
+# MAGIC **💡 Multi-Table Impact**: In production, this can mean processing 1,000 records across 5 tables instead of 5,000,000 records for incremental updates!
+# MAGIC
+# MAGIC ### 3.4 Starting all the streams
 # MAGIC
 # MAGIC We can now iterate over the folders to start the bronze & silver streams for each table.