2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.0
+current_version = 2.0.1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>(rc|dev))(?P<build>\d+))?
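For context, the parse pattern in this file is how bumpversion decomposes a version string into named parts. A quick standalone sketch of what the pattern accepts, using plain re (the example version strings are illustrative):

import re

# The parse pattern copied from the .bumpversion.cfg hunk above.
PATTERN = re.compile(
    r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
    r"(\-(?P<release>(rc|dev))(?P<build>\d+))?"
)

for version in ("2.0.1", "2.1.0-rc1"):
    parts = PATTERN.match(version).groupdict()
    print(version, {k: v for k, v in parts.items() if v is not None})
# 2.0.1 {'major': '2', 'minor': '0', 'patch': '1'}
# 2.1.0-rc1 {'major': '2', 'minor': '1', 'patch': '0', 'release': 'rc', 'build': '1'}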
1,279 changes: 1,141 additions & 138 deletions focus_validator/config_objects/focus_to_duckdb_converter.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions focus_validator/config_objects/rule.py
@@ -97,6 +97,9 @@ def is_dynamic(self) -> bool:
     def is_composite(self) -> bool:
         return self.function == "Composite"
 
+    def is_optional(self) -> bool:
+        return self.validation_criteria.keyword in ("OPTIONAL", "MAY")
+
     # Optional metadata
     notes: Optional[str] = Field(None, alias="Notes")
 
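The new is_optional predicate mirrors the existing is_composite helper and lets callers skip rules whose criteria keyword marks them as non-mandatory. A minimal, self-contained sketch of that usage; the ValidationCriteria/Rule stand-ins and the "MUST" keyword are illustrative assumptions, not the project's real classes:

from dataclasses import dataclass

# Hypothetical stand-ins; the real classes live in focus_validator/config_objects/rule.py.
@dataclass
class ValidationCriteria:
    keyword: str  # assumed keyword vocabulary: "MUST", "OPTIONAL", "MAY", ...

@dataclass
class Rule:
    validation_criteria: ValidationCriteria

    def is_optional(self) -> bool:
        # Same predicate as the diff above.
        return self.validation_criteria.keyword in ("OPTIONAL", "MAY")

rules = [Rule(ValidationCriteria("MUST")), Rule(ValidationCriteria("MAY"))]
mandatory = [r for r in rules if not r.is_optional()]
print(len(mandatory))  # -> 1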
108 changes: 103 additions & 5 deletions focus_validator/data_loaders/csv_data_loader.py
@@ -95,6 +95,80 @@ def _convert_pandas_to_polars_dtypes(
 
         return polars_dtypes
 
+    def _peek_for_all_null_columns(
+        self, filename_or_buffer, dtype_dict, peek_rows=5000
+    ):
+        """
+        Peek at first N rows to identify columns that are all NULL.
+        For these columns, we'll need to force the spec type to avoid misleading type errors.
+
+        Args:
+            filename_or_buffer: File path or buffer to peek into
+            dtype_dict: Dictionary of expected column types from spec
+            peek_rows: Number of rows to peek (default 5000)
+
+        Returns:
+            set: Column names that are all NULL in the peeked data
+        """
+        all_null_columns = set()
+
+        try:
+            # Peek at the first N rows
+            if isinstance(filename_or_buffer, str):
+                # File path - can peek efficiently
+                peek_df = pl.read_csv(
+                    filename_or_buffer,
+                    n_rows=peek_rows,
+                    infer_schema_length=peek_rows,
+                    null_values=[
+                        "INVALID",
+                        "INVALID_COST",
+                        "BAD_DATE",
+                        "INVALID_DECIMAL",
+                        "INVALID_INT",
+                        "NULL",
+                    ],
+                )
+            else:
+                # Buffer - need to read and reset (more expensive)
+                initial_pos = (
+                    filename_or_buffer.tell()
+                    if hasattr(filename_or_buffer, "tell")
+                    else None
+                )
+                peek_df = pl.read_csv(
+                    filename_or_buffer,
+                    n_rows=peek_rows,
+                    infer_schema_length=peek_rows,
+                    null_values=[
+                        "INVALID",
+                        "INVALID_COST",
+                        "BAD_DATE",
+                        "INVALID_DECIMAL",
+                        "INVALID_INT",
+                        "NULL",
+                    ],
+                )
+                # Reset buffer position if possible
+                if initial_pos is not None and hasattr(filename_or_buffer, "seek"):
+                    filename_or_buffer.seek(initial_pos)
+
+            # Check which columns from dtype_dict are all NULL in the peek
+            for col in dtype_dict.keys():
+                if col in peek_df.columns:
+                    if peek_df[col].null_count() == len(peek_df):
+                        all_null_columns.add(col)
+                        self.log.info(
+                            f"Column '{col}' is all NULL in first {peek_rows} rows - "
+                            f"will force type from spec to avoid misleading type errors"
+                        )
+
+        except Exception as e:
+            self.log.warning(f"Failed to peek at data for all-NULL detection: {e}")
+            # If peek fails, continue without this optimization
+
+        return all_null_columns
+
     def _try_load_with_types(self, filename_or_buffer, dtype_dict, parse_dates_list):
         """
         Attempt to load CSV with specified types using Polars, with retry logic for problematic columns.
@@ -103,15 +177,39 @@ def _try_load_with_types(self, filename_or_buffer, dtype_dict, parse_dates_list)
             pl.DataFrame: Loaded DataFrame with types applied
         """
         try:
-            # Convert to Polars schema
-            polars_dtypes = self._convert_pandas_to_polars_dtypes(dtype_dict)
+            # First, peek to find all-NULL columns
+            all_null_columns = self._peek_for_all_null_columns(
+                filename_or_buffer, dtype_dict
+            )
 
-            self.log.debug(f"Attempting to load with Polars dtypes: {polars_dtypes}")
+            # Convert to Polars schema - only for all-NULL columns to force their types
+            # For columns with data, let Polars infer so we can detect type mismatches
+            polars_dtypes_to_force = {}
+            if all_null_columns:
+                forced_dtypes = {
+                    col: dtype
+                    for col, dtype in dtype_dict.items()
+                    if col in all_null_columns
+                }
+                polars_dtypes_to_force = self._convert_pandas_to_polars_dtypes(
+                    forced_dtypes
+                )
+                self.log.info(
+                    f"Forcing types for {len(polars_dtypes_to_force)} all-NULL columns: "
+                    f"{', '.join(sorted(polars_dtypes_to_force.keys()))}"
+                )
+
+            self.log.debug(
+                f"Attempting to load with forced Polars dtypes: {polars_dtypes_to_force}"
+            )
 
-            # Use schema_overrides instead of deprecated dtypes parameter
+            # Use schema_overrides only for all-NULL columns
+            # For other columns, let Polars infer from data so type checks can catch mismatches
             df = pl.read_csv(
                 filename_or_buffer,
-                schema_overrides=polars_dtypes,
+                schema_overrides=(
+                    polars_dtypes_to_force if polars_dtypes_to_force else None
+                ),
                 try_parse_dates=bool(parse_dates_list),
                 infer_schema_length=10000,  # Increased inference length
                 null_values=[
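The loader change boils down to a two-pass strategy: sample a bounded prefix of the input, detect columns whose sampled values are entirely null, then force only those columns to their spec types on the full read while letting Polars infer everything else, so genuine type mismatches still surface. A self-contained sketch of the technique; the sample data and the spec_types mapping are illustrative, not from the repository:

import io

import polars as pl

# Illustrative input: column "b" is entirely NULL in the sample.
csv = io.BytesIO(b"a,b\n1,NULL\n2,NULL\n")
spec_types = {"a": pl.Int64, "b": pl.Float64}  # assumed spec-derived types

# Pass 1: peek at a bounded prefix, treating sentinel strings as null.
peek = pl.read_csv(csv, n_rows=5000, null_values=["NULL"])
all_null = {
    col
    for col in spec_types
    if col in peek.columns and peek[col].null_count() == len(peek)
}

csv.seek(0)  # reset the buffer before the real read, as the loader does

# Pass 2: force spec types only for all-NULL columns; infer the rest.
df = pl.read_csv(
    csv,
    null_values=["NULL"],
    schema_overrides={col: spec_types[col] for col in all_null} or None,
)
print(df.schema)  # "a" stays inferred (Int64); "b" is forced to Float64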
448 changes: 370 additions & 78 deletions focus_validator/outputter/outputter_web.py

Large diffs are not rendered by default.
