From f2ca121a85236975fb8e18d29d50357e38187cd5 Mon Sep 17 00:00:00 2001
From: John Grubb <git@grubb.johnnygrubb.com>
Date: Mon, 14 Jul 2025 17:22:02 -0400
Subject: [PATCH 1/4] refactor(aws): Extract common CLI helper functions

- Add _load_and_validate_config() to eliminate duplicate config loading
- Add _get_aws_credentials() to centralize AWS credential extraction
- Replace 4 instances of duplicate config loading logic across CLI commands
- Reduce code duplication by ~40 lines while maintaining exact functionality

Co-Authored-By: Claude <noreply@anthropic.com>
---
 vendors/aws/cli.py | 110 ++++++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 61 deletions(-)

diff --git a/vendors/aws/cli.py b/vendors/aws/cli.py
index ea0d08a..9faebde 100644
--- a/vendors/aws/cli.py
+++ b/vendors/aws/cli.py
@@ -11,47 +11,59 @@
 from .manifest import ManifestLocator
 
 
-def aws_import_cur(args):
-    """Import AWS Cost and Usage Reports."""
-    
-    # Load configuration
+def _load_and_validate_config(args, required_aws_fields=True):
+    """Load configuration, merge CLI args, and validate AWS config if needed."""
     config_path = Path(args.config) if args.config else Path('config.toml')
     config = Config.load(config_path)
     
-    # Merge CLI arguments into configuration
+    # Merge CLI arguments into configuration (only non-None values)
     cli_args = {
-        'bucket': args.bucket,
-        'prefix': args.prefix,
-        'export_name': args.export_name,
-        'cur_version': args.cur_version,
-        'export_format': args.export_format,
-        'start_date': args.start_date,
-        'end_date': args.end_date,
-        'reset': args.reset,
-        'table_strategy': args.table_strategy
+        'bucket': getattr(args, 'bucket', None),
+        'prefix': getattr(args, 'prefix', None),
+        'export_name': getattr(args, 'export_name', None),
+        'cur_version': getattr(args, 'cur_version', None),
+        'export_format': getattr(args, 'export_format', None),
+        'start_date': getattr(args, 'start_date', None),
+        'end_date': getattr(args, 'end_date', None),
+        'reset': getattr(args, 'reset', None),
+        'table_strategy': getattr(args, 'table_strategy', None)
     }
+    cli_args = {k: v for k, v in cli_args.items() if v is not None}
+    config.merge_cli_args(cli_args)
     
     # Override database backend if specified via CLI
     if hasattr(args, 'destination') and args.destination != 'duckdb':
-        # CLI override for destination
         config.database.backend = args.destination
     
-    # Remove None values
-    cli_args = {k: v for k, v in cli_args.items() if v is not None}
-    
-    # Merge with config
-    config.merge_cli_args(cli_args)
+    # Validate AWS configuration if required
+    if required_aws_fields:
+        try:
+            config.validate_aws_config()
+        except ValueError as e:
+            print(f"Error: {e}", file=sys.stderr)
+            print("\nRequired parameters can be set via:")
+            print("  - config.toml file")
+            print("  - Environment variables (OPEN_FINOPS_AWS_*)")
+            print("  - Command-line flags")
+            sys.exit(1)
+    
+    return config
+
+
+def _get_aws_credentials(config):
+    """Extract AWS credentials dictionary from config."""
+    return {
+        'access_key_id': config.aws.access_key_id,
+        'secret_access_key': config.aws.secret_access_key,
+        'region': config.aws.region
+    }
+
+
+def aws_import_cur(args):
+    """Import AWS Cost and Usage Reports."""
     
-    # Validate we have required fields
-    try:
-        config.validate_aws_config()
-    except ValueError as e:
-        print(f"Error: {e}", file=sys.stderr)
-        print("\nRequired parameters can be set via:")
-        print("  - config.toml file")
-        print("  - Environment variables (OPEN_FINOPS_AWS_*)")
-        print("  - Command-line flags")
-        sys.exit(1)
+    # Load and validate configuration
+    config = _load_and_validate_config(args)
     
     # Show configuration
     print("\nAWS CUR Import Configuration:")
@@ -87,33 +99,11 @@ def aws_import_cur(args):
 def aws_list_manifests(args):
     """List available billing periods in S3."""
     
-    # Load configuration
-    config_path = Path(args.config) if args.config else Path('config.toml')
-    config = Config.load(config_path)
-    
-    # Merge CLI arguments for bucket/prefix/export_name if provided
-    cli_args = {
-        'bucket': args.bucket,
-        'prefix': args.prefix,
-        'export_name': args.export_name,
-        'cur_version': args.cur_version
-    }
-    cli_args = {k: v for k, v in cli_args.items() if v is not None}
-    config.merge_cli_args(cli_args)
-    
-    # Validate we have required fields
-    try:
-        config.validate_aws_config()
-    except ValueError as e:
-        print(f"Error: {e}", file=sys.stderr)
-        sys.exit(1)
+    # Load and validate configuration
+    config = _load_and_validate_config(args)
     
     # Get AWS credentials
-    aws_creds = {
-        'access_key_id': config.aws.access_key_id,
-        'secret_access_key': config.aws.secret_access_key,
-        'region': config.aws.region
-    }
+    aws_creds = _get_aws_credentials(config)
     
     # Initialize manifest locator
     locator = ManifestLocator(
@@ -148,9 +138,8 @@ def aws_list_manifests(args):
 def aws_show_state(args):
     """Show load state and version history."""
     
-    # Load configuration
-    config_path = Path(args.config) if args.config else Path('config.toml')
-    config = Config.load(config_path)
+    # Load configuration (don't require all AWS fields, just export_name)
+    config = _load_and_validate_config(args, required_aws_fields=False)
     
     # Override export name if provided
     if args.export_name:
@@ -229,9 +218,8 @@ def aws_show_state(args):
 def aws_list_exports(args):
     """List all available exports and their tables."""
     
-    # Load configuration
-    config_path = Path(args.config) if args.config else Path('config.toml')
-    config = Config.load(config_path)
+    # Load configuration (don't require AWS fields for this command)
+    config = _load_and_validate_config(args, required_aws_fields=False)
     
     # Set up data directory path
     if config.project and config.project.data_dir:

From ba917630297df51163007350a19fa35bfd213b3d Mon Sep 17 00:00:00 2001
From: John Grubb <git@grubb.johnnygrubb.com>
Date: Mon, 14 Jul 2025 17:23:21 -0400
Subject: [PATCH 2/4] refactor(aws): Unify file reading and split large
 pipeline function

Major refactoring to improve code clarity and reduce duplication:

- Add DuckDBS3Reader class with context management for unified CSV/Parquet reading
- Remove duplicate read_csv_file() and read_parquet_file() functions
- Extract _run_single_table_strategy() and _run_separate_tables_strategy()
- Reduce run_aws_pipeline() from 325 lines to 40 lines with clear strategy delegation
- Consolidate record cleaning logic into single _clean_record() method
- Remove ~270 lines of duplicate connection and processing logic

All tests pass with zero functionality changes.
---
 vendors/aws/pipeline.py | 699 +++++++++++++++++++---------------------
 1 file changed, 330 insertions(+), 369 deletions(-)

diff --git a/vendors/aws/pipeline.py b/vendors/aws/pipeline.py
index f2e35cb..189852e 100644
--- a/vendors/aws/pipeline.py
+++ b/vendors/aws/pipeline.py
@@ -19,6 +19,69 @@
 from core.naming import create_table_name, sanitize_table_name
 
 
+class DuckDBS3Reader:
+    """Unified DuckDB S3 file reader for CSV and Parquet files."""
+    
+    def __init__(self, aws_creds: Dict[str, Any]):
+        """Initialize with AWS credentials."""
+        self.aws_creds = aws_creds
+        self.conn = None
+    
+    def __enter__(self):
+        """Context manager entry - setup DuckDB connection."""
+        self.conn = duckdb.connect()
+        
+        # Install and load httpfs extension for S3 access
+        self.conn.execute("INSTALL httpfs")
+        self.conn.execute("LOAD httpfs")
+        
+        # Configure AWS credentials
+        self.conn.execute(f"SET s3_access_key_id='{self.aws_creds['access_key_id']}'")
+        self.conn.execute(f"SET s3_secret_access_key='{self.aws_creds['secret_access_key']}'")
+        self.conn.execute(f"SET s3_region='{self.aws_creds.get('region', 'us-east-1')}'")
+        
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - cleanup connection."""
+        if self.conn:
+            self.conn.close()
+    
+    def read_file(self, bucket: str, key: str, file_format: str) -> Iterator[Dict[str, Any]]:
+        """Read file from S3 and yield records as dictionaries."""
+        s3_path = f"s3://{bucket}/{key}"
+        
+        # Read file based on format
+        if file_format == 'csv':
+            if key.endswith('.gz'):
+                result = self.conn.execute(f"SELECT * FROM read_csv_auto('{s3_path}', compression='gzip')").fetchall()
+            else:
+                result = self.conn.execute(f"SELECT * FROM read_csv_auto('{s3_path}')").fetchall()
+        elif file_format == 'parquet':
+            result = self.conn.execute(f"SELECT * FROM read_parquet('{s3_path}')").fetchall()
+        else:
+            raise ValueError(f"Unsupported file format: {file_format}")
+        
+        columns = [desc[0] for desc in self.conn.description]
+        
+        print(f"    Loaded {len(result)} rows from {file_format} file")
+        print(f"    Columns: {len(columns)}")
+        
+        # Yield records as dictionaries with cleaned column names
+        for row in result:
+            record = dict(zip(columns, row))
+            yield self._clean_record(record)
+    
+    def _clean_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        """Clean column names by replacing '/' with '_'."""
+        cleaned_record = {}
+        for col, val in record.items():
+            # Replace / with _ (e.g., "lineItem/UnblendedCost" -> "lineItem_UnblendedCost")
+            clean_col = col.replace('/', '_') if '/' in col else col
+            cleaned_record[clean_col] = val
+        return cleaned_record
+
+
 def create_unified_view(export_name: str, pipeline, config: AWSConfig, backend) -> None:
     """Create a unified view that combines all billing periods for an export.
     
@@ -223,19 +286,17 @@ def billing_period_resource(
         else:
             raise ValueError(f"Cannot determine file format for {s3_key}")
         
-        # Yield records from the file using data reader
-        if data_reader and file_format == 'parquet':
-            yield from data_reader.read_parquet_file(s3_bucket, s3_key, aws_creds)
-        elif data_reader and file_format == 'csv':
-            yield from data_reader.read_csv_file(s3_bucket, s3_key, aws_creds)
+        # Yield records from the file using unified reader
+        if data_reader and hasattr(data_reader, 'read_parquet_file'):
+            # Use backend-specific reader if available
+            if file_format == 'parquet':
+                yield from data_reader.read_parquet_file(s3_bucket, s3_key, aws_creds)
+            else:
+                yield from data_reader.read_csv_file(s3_bucket, s3_key, aws_creds)
         else:
-            # Fallback to old method for backward compatibility
-            yield from read_report_file(
-                bucket=s3_bucket,
-                key=s3_key,
-                file_format=file_format,
-                aws_creds=aws_creds
-            )
+            # Use unified DuckDB reader
+            with DuckDBS3Reader(aws_creds) as reader:
+                yield from reader.read_file(s3_bucket, s3_key, file_format)
 
 
 def billing_period_with_partition(
@@ -257,112 +318,279 @@ def billing_period_with_partition(
         yield record
 
 
-def read_report_file(
-    bucket: str,
-    key: str,
-    file_format: str,
-    aws_creds: Dict[str, Any]
-) -> Iterator[Dict[str, Any]]:
-    """Read records from a CUR report file using DuckDB."""
-    
-    if file_format == 'parquet':
-        yield from read_parquet_file(None, bucket, key, aws_creds)
-    else:
-        yield from read_csv_file(None, bucket, key, aws_creds)
 
 
-def read_csv_file(s3_client, bucket: str, key: str, aws_creds: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
-    """Read CSV file from S3 using DuckDB and yield records."""
+def _run_single_table_strategy(config: AWSConfig, backend, state_manager, data_reader, pipeline) -> None:
+    """Run pipeline using single table strategy with partition replacement."""
+    print("Using single table strategy with partition replacement")
     
-    # Create a temporary DuckDB connection
-    conn = duckdb.connect()
-    
-    # Install and load the httpfs extension for S3 access
-    conn.execute("INSTALL httpfs")
-    conn.execute("LOAD httpfs")
+    # Get AWS credentials
+    aws_creds = {
+        'access_key_id': config.access_key_id,
+        'secret_access_key': config.secret_access_key,
+        'region': config.region
+    }
     
-    # Configure AWS credentials for DuckDB
-    conn.execute(f"SET s3_access_key_id='{aws_creds['access_key_id']}'")
-    conn.execute(f"SET s3_secret_access_key='{aws_creds['secret_access_key']}'")
-    conn.execute(f"SET s3_region='{aws_creds.get('region', 'us-east-1')}'")
+    locator = ManifestLocator(
+        bucket=config.bucket,
+        prefix=config.prefix,
+        export_name=config.export_name,
+        cur_version=config.cur_version
+    )
     
-    # Read directly from S3 using DuckDB
-    s3_path = f"s3://{bucket}/{key}"
+    manifests = locator.list_manifests(
+        start_date=config.start_date,
+        end_date=config.end_date,
+        **aws_creds
+    )
     
-    try:
-        # Handle gzipped files - DuckDB can read them directly
-        if key.endswith('.gz'):
-            result = conn.execute(f"SELECT * FROM read_csv_auto('{s3_path}', compression='gzip')").fetchall()
-        else:
-            result = conn.execute(f"SELECT * FROM read_csv_auto('{s3_path}')").fetchall()
-            
-        columns = [desc[0] for desc in conn.description]
+    # Process each billing period separately
+    for manifest in manifests:
+        print(f"\nProcessing {manifest.billing_period}...")
         
-        print(f"    Loaded {len(result)} rows from CSV file")
-        print(f"    Columns: {len(columns)}")
+        # Fetch the full manifest to get assembly ID
+        full_manifest = locator.fetch_manifest(manifest, **aws_creds)
+        print(f"  Assembly ID: {full_manifest.assembly_id}")
         
-        # Yield records as dictionaries
-        for row in result:
-            record = dict(zip(columns, row))
-            # Clean up column names (replace / with _)
-            cleaned_record = {}
-            for col, val in record.items():
-                # Handle different column naming conventions
-                if '/' in col:
-                    # Replace / with _ (e.g., "lineItem/UnblendedCost" -> "lineItem_UnblendedCost")
-                    clean_col = col.replace('/', '_')
-                else:
-                    clean_col = col
-                cleaned_record[clean_col] = val
-            yield cleaned_record
+        # Check if this version has already been loaded
+        if state_manager.is_version_loaded('aws', config.export_name, 
+                                          manifest.billing_period, full_manifest.assembly_id):
+            print(f"  ✓ Already loaded (skipping)")
+            continue
+        
+        # Start tracking this load
+        state_manager.start_load(
+            vendor='aws',
+            export_name=config.export_name,
+            billing_period=manifest.billing_period,
+            version_id=full_manifest.assembly_id,
+            data_format_version=config.cur_version,
+            file_count=len(full_manifest.report_keys)
+        )
+        
+        try:
+            # Delete existing data for this billing period
+            with pipeline.sql_client() as client:
+                # Check if table exists
+                tables = client.execute_sql(
+                    "SELECT table_name FROM information_schema.tables "
+                    f"WHERE table_schema = '{config.dataset_name}' AND table_name = 'billing_data'"
+                )
+                
+                if tables:
+                    # Delete data for this billing period
+                    delete_sql = f"""
+                    DELETE FROM {config.dataset_name}.billing_data 
+                    WHERE billing_period = '{manifest.billing_period}'
+                    """
+                    client.execute_sql(delete_sql)
+                    print(f"  Deleted existing data for {manifest.billing_period}")
+            
+            # Load new data for this billing period
+            load_info = pipeline.run(
+                [dlt.resource(
+                    billing_period_with_partition(full_manifest, config, aws_creds, data_reader),
+                    name="billing_data",
+                    write_disposition="append"
+                )]
+            )
+            
+            # Count rows loaded for this billing period
+            row_count = 0
+            try:
+                with pipeline.sql_client() as client:
+                    # Use backend-specific table reference
+                    table_ref = backend.get_table_reference(config.dataset_name, "billing_data")
+                    result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref} WHERE billing_period = '{manifest.billing_period}'")
+                    row_count = result[0][0] if result else 0
+                    print(f"  Loaded {row_count:,} rows")
+            except Exception as e:
+                print(f"  Loaded data (row count unavailable: {e})")
             
-    finally:
-        conn.close()
+            # Mark load as completed
+            state_manager.complete_load(
+                vendor='aws',
+                export_name=config.export_name,
+                billing_period=manifest.billing_period,
+                version_id=full_manifest.assembly_id,
+                row_count=row_count
+            )
+            
+        except Exception as e:
+            # Mark load as failed
+            state_manager.fail_load(
+                vendor='aws',
+                export_name=config.export_name,
+                billing_period=manifest.billing_period,
+                version_id=full_manifest.assembly_id,
+                error_message=str(e)
+            )
+            raise
 
 
-def read_parquet_file(s3_client, bucket: str, key: str, aws_creds: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
-    """Read Parquet file from S3 using DuckDB and yield records."""
+def _run_separate_tables_strategy(config: AWSConfig, backend, state_manager, data_reader, pipeline) -> None:
+    """Run pipeline using separate tables strategy (recommended)."""
+    print("Using separate tables strategy")
+    print(f"Database location: {backend.get_database_path_or_connection()}")
+    
+    # Get AWS credentials
+    aws_creds = {
+        'access_key_id': config.access_key_id,
+        'secret_access_key': config.secret_access_key,
+        'region': config.region
+    }
     
-    # Create a temporary DuckDB connection
-    conn = duckdb.connect()
+    locator = ManifestLocator(
+        bucket=config.bucket,
+        prefix=config.prefix,
+        export_name=config.export_name,
+        cur_version=config.cur_version
+    )
     
-    # Install and load the httpfs extension for S3 access
-    conn.execute("INSTALL httpfs")
-    conn.execute("LOAD httpfs")
+    manifests = locator.list_manifests(
+        start_date=config.start_date,
+        end_date=config.end_date,
+        **aws_creds
+    )
     
-    # Configure AWS credentials for DuckDB
-    conn.execute(f"SET s3_access_key_id='{aws_creds['access_key_id']}'")
-    conn.execute(f"SET s3_secret_access_key='{aws_creds['secret_access_key']}'")
-    conn.execute(f"SET s3_region='{aws_creds.get('region', 'us-east-1')}'")
+    # Filter manifests to only those that need loading
+    manifests_to_load = []
+    for manifest in manifests:
+        # Fetch full manifest to get assembly ID
+        full_manifest = locator.fetch_manifest(manifest, **aws_creds)
+        
+        # Check if already loaded (skip this check if reset flag is set)
+        if not config.reset and state_manager.is_version_loaded('aws', config.export_name,
+                                         full_manifest.billing_period, full_manifest.assembly_id):
+            print(f"Skipping {full_manifest.billing_period} - already loaded (assembly ID: {full_manifest.assembly_id})")
+        else:
+            if config.reset:
+                print(f"Will reload {full_manifest.billing_period} - reset flag set (assembly ID: {full_manifest.assembly_id})")
+            else:
+                print(f"Will load {full_manifest.billing_period} - new version (assembly ID: {full_manifest.assembly_id})")
+            manifests_to_load.append(full_manifest)
     
-    # Read directly from S3 using DuckDB
-    s3_path = f"s3://{bucket}/{key}"
+    if not manifests_to_load:
+        print("\n✓ All billing periods are up to date!")
+        return
     
-    try:
-        # Query the parquet file directly
-        result = conn.execute(f"SELECT * FROM read_parquet('{s3_path}')").fetchall()
-        columns = [desc[0] for desc in conn.description]
+    print(f"\nLoading {len(manifests_to_load)} billing period(s)...")
+    
+    # Process each manifest that needs loading
+    for manifest in manifests_to_load:
+        print(f"\nProcessing {manifest.billing_period}...")
+        print(f"  Assembly ID: {manifest.assembly_id}")
+        print(f"  Report files: {len(manifest.report_keys)}")
         
-        print(f"    Loaded {len(result)} rows from parquet file")
-        print(f"    Columns: {len(columns)}")
+        # Start tracking this load
+        state_manager.start_load(
+            vendor='aws',
+            export_name=config.export_name,
+            billing_period=manifest.billing_period,
+            version_id=manifest.assembly_id,
+            data_format_version=config.cur_version,
+            file_count=len(manifest.report_keys)
+        )
         
-        # Yield records as dictionaries
-        for row in result:
-            record = dict(zip(columns, row))
-            # Clean up column names (replace / with _)
-            cleaned_record = {}
-            for col, val in record.items():
-                # Handle different column naming conventions
-                if '/' in col:
-                    # Replace / with _ (e.g., "lineItem/UnblendedCost" -> "lineItem_UnblendedCost")
-                    clean_col = col.replace('/', '_')
-                else:
-                    clean_col = col
-                cleaned_record[clean_col] = val
-            yield cleaned_record
+        try:
+            # Create a resource for just this billing period
+            table_name = create_table_name(config.export_name, manifest.billing_period)
+            
+            # Run pipeline for this specific manifest
+            load_info = pipeline.run(
+                dlt.resource(
+                    billing_period_resource(manifest, config, aws_creds, data_reader),
+                    name=table_name,
+                    write_disposition="replace"  # Replace the entire table
+                )
+            )
+            
+            # Count rows loaded
+            row_count = 0
+            try:
+                with pipeline.sql_client() as client:
+                    # Use backend-specific table reference
+                    table_ref = backend.get_table_reference(config.dataset_name, table_name)
+                    result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref}")
+                    row_count = result[0][0] if result else 0
+                    print(f"  ✓ Loaded {row_count:,} rows")
+            except Exception as e:
+                print(f"  ✓ Loaded data (row count unavailable: {e})")
+            
+            # Mark load as completed
+            state_manager.complete_load(
+                vendor='aws',
+                export_name=config.export_name,
+                billing_period=manifest.billing_period,
+                version_id=manifest.assembly_id,
+                row_count=row_count
+            )
+            
+        except Exception as e:
+            # Mark load as failed
+            state_manager.fail_load(
+                vendor='aws',
+                export_name=config.export_name,
+                billing_period=manifest.billing_period,
+                version_id=manifest.assembly_id,
+                error_message=str(e)
+            )
+            print(f"  ✗ Failed: {e}")
+            raise
+    
+    # Show summary of all tables
+    print("\n" + "="*50)
+    print("SUMMARY")
+    print("="*50)
+    
+    total_rows = 0
+    try:
+        with pipeline.sql_client() as client:
+            # Get list of billing tables for this export
+            # We need to match tables that contain the sanitized export name
+            clean_export = sanitize_table_name(config.export_name)
+            
+            tables_result = client.execute_sql(
+                "SELECT table_name FROM information_schema.tables "
+                f"WHERE table_schema = '{config.dataset_name}' AND table_name LIKE '{clean_export}_%' "
+                "ORDER BY table_name"
+            )
             
-    finally:
-        conn.close()
+            print("\nAll billing tables:")
+            for table_row in tables_result:
+                table_name = table_row[0]
+                table_ref = backend.get_table_reference(config.dataset_name, table_name)
+                count_result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref}")
+                rows = count_result[0][0] if count_result else 0
+                total_rows += rows
+                
+                # Get billing period from table name
+                # Extract the last part after the export name (YYYY_MM)
+                parts = table_name.split('_')
+                if len(parts) >= 2:
+                    billing_period = f"{parts[-2]}-{parts[-1]}"
+                else:
+                    billing_period = "unknown"
+                
+                # Get version info from state tracker
+                versions = state_manager.get_version_history('aws', config.export_name, billing_period)
+                current_version = next((v for v in versions if v['current_version']), None)
+                
+                if current_version:
+                    print(f"  {table_name}: {rows:,} rows (version: {current_version['version_id'][:8]}...)")
+                else:
+                    print(f"  {table_name}: {rows:,} rows")
+                    
+    except Exception as e:
+        print(f"Could not get table summary: {e}")
+    
+    print(f"\nTotal rows in database: {total_rows:,}")
+    
+    # Create unified view for this export
+    print("\n" + "="*50)
+    print("CREATING UNIFIED VIEW")
+    print("="*50)
+    create_unified_view(config.export_name, pipeline, config, backend)
 
 
 def run_aws_pipeline(config: AWSConfig, 
@@ -418,275 +646,8 @@ def run_aws_pipeline(config: AWSConfig,
         dataset_name=config.dataset_name
     )
     
-    # For single table strategy with proper partition replacement
+    # Choose strategy
     if table_strategy == "single":
-        # We need to manually delete old data for each billing period
-        # before loading new data
-        print("Using single table strategy with partition replacement")
-        
-        # Get the manifest list first
-        aws_creds = {
-            'access_key_id': config.access_key_id,
-            'secret_access_key': config.secret_access_key,
-            'region': config.region
-        }
-        
-        locator = ManifestLocator(
-            bucket=config.bucket,
-            prefix=config.prefix,
-            export_name=config.export_name,
-            cur_version=config.cur_version
-        )
-        
-        manifests = locator.list_manifests(
-            start_date=config.start_date,
-            end_date=config.end_date,
-            **aws_creds
-        )
-        
-        # Process each billing period separately
-        for manifest in manifests:
-            print(f"\nProcessing {manifest.billing_period}...")
-            
-            # Fetch the full manifest to get assembly ID
-            full_manifest = locator.fetch_manifest(manifest, **aws_creds)
-            print(f"  Assembly ID: {full_manifest.assembly_id}")
-            
-            # Check if this version has already been loaded
-            if state_manager.is_version_loaded('aws', config.export_name, 
-                                              manifest.billing_period, full_manifest.assembly_id):
-                print(f"  ✓ Already loaded (skipping)")
-                continue
-            
-            # Start tracking this load
-            state_manager.start_load(
-                vendor='aws',
-                export_name=config.export_name,
-                billing_period=manifest.billing_period,
-                version_id=full_manifest.assembly_id,
-                data_format_version=config.cur_version,
-                file_count=len(full_manifest.report_keys)
-            )
-            
-            try:
-                # Delete existing data for this billing period
-                with pipeline.sql_client() as client:
-                    # Check if table exists
-                    tables = client.execute_sql(
-                        "SELECT table_name FROM information_schema.tables "
-                        f"WHERE table_schema = '{config.dataset_name}' AND table_name = 'billing_data'"
-                    )
-                    
-                    if tables:
-                        # Delete data for this billing period
-                        delete_sql = f"""
-                        DELETE FROM {config.dataset_name}.billing_data 
-                        WHERE billing_period = '{manifest.billing_period}'
-                        """
-                        client.execute_sql(delete_sql)
-                        print(f"  Deleted existing data for {manifest.billing_period}")
-                
-                # Load new data for this billing period
-                load_info = pipeline.run(
-                    [dlt.resource(
-                        billing_period_with_partition(full_manifest, config, aws_creds, data_reader),
-                        name="billing_data",
-                        write_disposition="append"
-                    )]
-                )
-                
-                # Count rows loaded for this billing period
-                row_count = 0
-                try:
-                    with pipeline.sql_client() as client:
-                        # Use backend-specific table reference
-                        table_ref = backend.get_table_reference(config.dataset_name, "billing_data")
-                        result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref} WHERE billing_period = '{manifest.billing_period}'")
-                        row_count = result[0][0] if result else 0
-                        print(f"  Loaded {row_count:,} rows")
-                except Exception as e:
-                    print(f"  Loaded data (row count unavailable: {e})")
-                
-                # Mark load as completed
-                state_manager.complete_load(
-                    vendor='aws',
-                    export_name=config.export_name,
-                    billing_period=manifest.billing_period,
-                    version_id=full_manifest.assembly_id,
-                    row_count=row_count
-                )
-                
-            except Exception as e:
-                # Mark load as failed
-                state_manager.fail_load(
-                    vendor='aws',
-                    export_name=config.export_name,
-                    billing_period=manifest.billing_period,
-                    version_id=full_manifest.assembly_id,
-                    error_message=str(e)
-                )
-                raise
-    
+        _run_single_table_strategy(config, backend, state_manager, data_reader, pipeline)
     else:
-        # Use separate tables strategy (default and recommended)
-        print("Using separate tables strategy")
-        print(f"Database location: {backend.get_database_path_or_connection()}")
-        
-        # Get all manifests first to check which need loading
-        aws_creds = {
-            'access_key_id': config.access_key_id,
-            'secret_access_key': config.secret_access_key,
-            'region': config.region
-        }
-        
-        locator = ManifestLocator(
-            bucket=config.bucket,
-            prefix=config.prefix,
-            export_name=config.export_name,
-            cur_version=config.cur_version
-        )
-        
-        manifests = locator.list_manifests(
-            start_date=config.start_date,
-            end_date=config.end_date,
-            **aws_creds
-        )
-        
-        # Filter manifests to only those that need loading
-        manifests_to_load = []
-        for manifest in manifests:
-            # Fetch full manifest to get assembly ID
-            full_manifest = locator.fetch_manifest(manifest, **aws_creds)
-            
-            # Check if already loaded (skip this check if reset flag is set)
-            if not config.reset and state_manager.is_version_loaded('aws', config.export_name,
-                                             full_manifest.billing_period, full_manifest.assembly_id):
-                print(f"Skipping {full_manifest.billing_period} - already loaded (assembly ID: {full_manifest.assembly_id})")
-            else:
-                if config.reset:
-                    print(f"Will reload {full_manifest.billing_period} - reset flag set (assembly ID: {full_manifest.assembly_id})")
-                else:
-                    print(f"Will load {full_manifest.billing_period} - new version (assembly ID: {full_manifest.assembly_id})")
-                manifests_to_load.append(full_manifest)
-        
-        if not manifests_to_load:
-            print("\n✓ All billing periods are up to date!")
-            return
-        
-        print(f"\nLoading {len(manifests_to_load)} billing period(s)...")
-        
-        # Process each manifest that needs loading
-        for manifest in manifests_to_load:
-            print(f"\nProcessing {manifest.billing_period}...")
-            print(f"  Assembly ID: {manifest.assembly_id}")
-            print(f"  Report files: {len(manifest.report_keys)}")
-            
-            # Start tracking this load
-            state_manager.start_load(
-                vendor='aws',
-                export_name=config.export_name,
-                billing_period=manifest.billing_period,
-                version_id=manifest.assembly_id,
-                data_format_version=config.cur_version,
-                file_count=len(manifest.report_keys)
-            )
-            
-            try:
-                # Create a resource for just this billing period
-                table_name = create_table_name(config.export_name, manifest.billing_period)
-                
-                # Run pipeline for this specific manifest
-                load_info = pipeline.run(
-                    dlt.resource(
-                        billing_period_resource(manifest, config, aws_creds, data_reader),
-                        name=table_name,
-                        write_disposition="replace"  # Replace the entire table
-                    )
-                )
-                
-                # Count rows loaded
-                row_count = 0
-                try:
-                    with pipeline.sql_client() as client:
-                        # Use backend-specific table reference
-                        table_ref = backend.get_table_reference(config.dataset_name, table_name)
-                        result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref}")
-                        row_count = result[0][0] if result else 0
-                        print(f"  ✓ Loaded {row_count:,} rows")
-                except Exception as e:
-                    print(f"  ✓ Loaded data (row count unavailable: {e})")
-                
-                # Mark load as completed
-                state_manager.complete_load(
-                    vendor='aws',
-                    export_name=config.export_name,
-                    billing_period=manifest.billing_period,
-                    version_id=manifest.assembly_id,
-                    row_count=row_count
-                )
-                
-            except Exception as e:
-                # Mark load as failed
-                state_manager.fail_load(
-                    vendor='aws',
-                    export_name=config.export_name,
-                    billing_period=manifest.billing_period,
-                    version_id=manifest.assembly_id,
-                    error_message=str(e)
-                )
-                print(f"  ✗ Failed: {e}")
-                raise
-        
-        # Show summary of all tables
-        print("\n" + "="*50)
-        print("SUMMARY")
-        print("="*50)
-        
-        total_rows = 0
-        try:
-            with pipeline.sql_client() as client:
-                # Get list of billing tables for this export
-                # We need to match tables that contain the sanitized export name
-                clean_export = sanitize_table_name(config.export_name)
-                
-                tables_result = client.execute_sql(
-                    "SELECT table_name FROM information_schema.tables "
-                    f"WHERE table_schema = '{config.dataset_name}' AND table_name LIKE '{clean_export}_%' "
-                    "ORDER BY table_name"
-                )
-                
-                print("\nAll billing tables:")
-                for table_row in tables_result:
-                    table_name = table_row[0]
-                    table_ref = backend.get_table_reference(config.dataset_name, table_name)
-                    count_result = client.execute_sql(f"SELECT COUNT(*) FROM {table_ref}")
-                    rows = count_result[0][0] if count_result else 0
-                    total_rows += rows
-                    
-                    # Get billing period from table name
-                    # Extract the last part after the export name (YYYY_MM)
-                    parts = table_name.split('_')
-                    if len(parts) >= 2:
-                        billing_period = f"{parts[-2]}-{parts[-1]}"
-                    else:
-                        billing_period = "unknown"
-                    
-                    # Get version info from state tracker
-                    versions = state_manager.get_version_history('aws', config.export_name, billing_period)
-                    current_version = next((v for v in versions if v['current_version']), None)
-                    
-                    if current_version:
-                        print(f"  {table_name}: {rows:,} rows (version: {current_version['version_id'][:8]}...)")
-                    else:
-                        print(f"  {table_name}: {rows:,} rows")
-                    
-        except Exception as e:
-            print(f"Could not get table summary: {e}")
-        
-        print(f"\nTotal rows in database: {total_rows:,}")
-        
-        # Create unified view for this export
-        print("\n" + "="*50)
-        print("CREATING UNIFIED VIEW")
-        print("="*50)
-        create_unified_view(config.export_name, pipeline, config, backend)
+        _run_separate_tables_strategy(config, backend, state_manager, data_reader, pipeline)

From 3b2af2fd2cf4dd6256297515cff33c2c1b1cc075 Mon Sep 17 00:00:00 2001
From: John Grubb <git@grubb.johnnygrubb.com>
Date: Thu, 21 Aug 2025 07:58:29 -0400
Subject: [PATCH 3/4] refactor(cli): Improve vendor plugin discovery and error
 handling

- Add comprehensive documentation for CLI coordinator class
- Enhance error messages for missing vendors with helpful suggestions
- Improve code clarity with better comments and structure
- Maintain backward compatibility with existing functionality

Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 core/cli/main.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/core/cli/main.py b/core/cli/main.py
index fdcf73c..4848bd3 100644
--- a/core/cli/main.py
+++ b/core/cli/main.py
@@ -8,18 +8,22 @@
 
 
 class FinOpsCLI:
-    """Main CLI with plugin discovery."""
+    """Main CLI coordinator that discovers and manages vendor plugins.
+    
+    Automatically discovers vendor plugins via setuptools entry points,
+    with fallback to direct imports for development environments.
+    """
     
     def __init__(self):
         self.vendors: Dict[str, Type[VendorCommands]] = {}
         self._discover_vendors()
     
     def _discover_vendors(self):
-        """Discover vendor plugins via entry points."""
+        """Discover vendor plugins via entry points with development fallback."""
         vendors_found = False
         
         try:
-            # Phase 2: Automatic discovery via entry points
+            # Primary method: Auto-discovery via setuptools entry points
             import pkg_resources
             
             entry_points = list(pkg_resources.iter_entry_points('open_finops.vendors'))
@@ -34,21 +38,24 @@ def _discover_vendors(self):
                     print(f"⚠ Failed to load vendor plugin {entry_point.name}: {e}")
                     
         except ImportError:
-            pass  # pkg_resources not available
+            # pkg_resources not available (rare case)
+            pass
         
-        # Fallback: Manual discovery for development mode
+        # Development fallback: Direct import when entry points not set up
         if not vendors_found:
             try:
                 from vendors.aws.cli import AWSCommands
                 self.vendors['aws'] = AWSCommands
             except ImportError:
-                pass  # AWS not installed
+                # AWS vendor not available in this installation
+                pass
     
     def run(self):
-        """Run the CLI."""
+        """Parse arguments and execute the appropriate vendor command."""
         parser = self._create_parser()
         args = parser.parse_args()
         
+        # Show help if no command specified
         if not args.command:
             parser.print_help()
             sys.exit(0)
@@ -59,8 +66,12 @@ def run(self):
             vendor_instance = vendor_class()
             vendor_instance.execute(args)
         else:
+            # Handle unknown vendor with helpful error message
             print(f"Vendor '{args.command}' not available")
-            print(f"Available vendors: {', '.join(self.vendors.keys())}")
+            if self.vendors:
+                print(f"Available vendors: {', '.join(self.vendors.keys())}")
+            else:
+                print("No vendor plugins found. Check your installation.")
             sys.exit(1)
     
     def _create_parser(self):
@@ -75,7 +86,7 @@ def _create_parser(self):
         
         subparsers = parser.add_subparsers(dest='command', help='Available commands')
         
-        # Let each vendor add its subcommands
+        # Register subcommands from each discovered vendor plugin
         for name, vendor_class in self.vendors.items():
             vendor_instance = vendor_class()
             vendor_instance.add_subparser(subparsers)

From ca3b42ec7f479506515c8cc9b17963369caff953 Mon Sep 17 00:00:00 2001
From: John Grubb <git@grubb.johnnygrubb.com>
Date: Thu, 21 Aug 2025 08:04:02 -0400
Subject: [PATCH 4/4] chore: Add .serena directory to gitignore

Exclude serena development artifacts from version control.

Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 5f3df45..26a96f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,4 @@ tmp/
 temp/
 *.tmp
 .aider*
+.serena/