diff --git a/src/nisarqa/processing/stats_h5_writer/metrics_writer.py b/src/nisarqa/processing/stats_h5_writer/metrics_writer.py
index 3c92c804..cd1233be 100644
--- a/src/nisarqa/processing/stats_h5_writer/metrics_writer.py
+++ b/src/nisarqa/processing/stats_h5_writer/metrics_writer.py
@@ -592,6 +592,55 @@ def get_stats_name_descr(stat: str, component: str | None) -> tuple[str, str]:
     )
 
 
+def get_list_of_real_stats_names() -> list[str]:
+    """
+    Return the names of all statistics Attributes for real-valued datasets.
+
+    These names follow NISAR conventions for the min/max/mean/std statistics.
+
+    Returns
+    -------
+    stat_names : list of str
+        Names of the min/max/mean/std statistics Attributes.
+    """
+    stat_names = []
+    for stat in ("min", "max", "mean", "std"):
+        s, _ = get_stats_name_descr(stat, component=None)
+        stat_names.append(s)
+    return stat_names
+
+
+def get_list_of_imag_stats_names() -> list[str]:
+    """
+    Return the names of all statistics Attributes for complex-valued datasets.
+
+    These names follow NISAR conventions for the min/max/mean/std statistics
+    of the real and imaginary components.
+
+    Returns
+    -------
+    stat_names : list of str
+        Names of the min/max/mean/std statistics Attributes for the real
+        and imaginary components.
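+
+    Examples
+    --------
+    The exact names are defined by `get_stats_name_descr`; the values below
+    are a hypothetical illustration only (hence the skipped doctest).
+
+    >>> get_list_of_imag_stats_names()  # doctest: +SKIP
+    ['min_real_value', 'max_real_value', 'mean_real_value',
+     'sample_stddev_real', 'min_imag_value', 'max_imag_value',
+     'mean_imag_value', 'sample_stddev_imag']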
+    """
+    stat_names = []
+    for comp in ("real", "imag"):
+        for stat in ("min", "max", "mean", "std"):
+            s, _ = get_stats_name_descr(stat, component=comp)
+            stat_names.append(s)
+    return stat_names
+
+
 def copy_non_insar_imagery_metrics(
     product: nisarqa.NonInsarProduct, stats_h5: h5py.File
 ) -> None:
diff --git a/src/nisarqa/validate/sanity_checks.py b/src/nisarqa/validate/sanity_checks.py
index f6d66901..06885f42 100644
--- a/src/nisarqa/validate/sanity_checks.py
+++ b/src/nisarqa/validate/sanity_checks.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 import re
-from collections.abc import Container
+from collections.abc import Callable, Container
+from typing import Any
 
 import h5py
 import numpy as np
@@ -12,5 +13,57 @@ objects_to_skip = nisarqa.get_all(name=__name__)
 
 
+def _log_if_bad_string_value(val: str | list[str], path: str) -> bool:
+    """
+    Log an error if a value is a known-invalid placeholder string.
+
+    Parameters
+    ----------
+    val : str or list of str
+        Value(s) to be checked.
+    path : str
+        Path to the dataset (and/or attribute) containing `val`, to be used
+        for logging. If `val` is the value of an attribute, it is suggested
+        to provide the dataset's path along with the attribute name.
+
+    Returns
+    -------
+    flagged : bool
+        True if at least one value in `val` was flagged as invalid and
+        logged as an error; False otherwise.
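+
+    Examples
+    --------
+    Hypothetical illustration (the dataset path below is made up):
+
+    >>> _log_if_bad_string_value(
+    ...     val="(NOT SPECIFIED)",
+    ...     path="/science/LSAR/identification/processingType",
+    ... )  # doctest: +SKIP
+    True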
+ f" Path: {item_name} -> {attr_name}" + ) + else: + log.error( + f"Attribute is meant for a complex-valued dataset," + " but is attached to a non-complex-valued dataset" + f" Path: {item_name} -> {attr_name}" + ) + + else: + _validate_string_logic( + name=f"{item_name} -> {attr_name}", + dtype_=attr_id.dtype, + value_provider=lambda: attr_val, + label="Attribute", + ) + + def visitor_func(path: str) -> None: + """Visitor function for h5py.visit.""" + + # The `complex64` HDF5 object is neither a HDF5 group nor dataset, skip. + if path.endswith("complex64"): + return + + obj = h5_file[path] + + # 1. Always check attributes (This is safe for large datasets) + _check_attributes(path, obj) + + # 2. Dataset-specific validation + if isinstance(obj, h5py.Dataset): + + # 2a.For all datasets (numeric, string, etc.) check if the dataset + # was populated with some value. (aka not an empty/null dataset) + + # Check if dataset is a 'null' space (Empty) without reading data. + # This occurs is when a dataset is written with a Python value of + # `None` (there could be other causes). h5py Datasets with no data + # have a shape of None or use the Empty class. + if obj.shape is None: + msg = f"Dataset has a null (Empty) space. Dataset: {obj.name}" + log.error(msg) + return + + # Check if storage was allocated (0 bytes means uninitialized/empty) + if obj.id.get_storage_size() == 0: + log.error( + "Dataset has no allocated storage (empty)." + f" Dataset: {obj.name}" + ) + return + + # 2b. String Type/Content Check + _validate_string_logic( + name=obj.name, + dtype_=obj.dtype, + # Only called if dtype is string + value_provider=lambda: obj[()], + label="Dataset", + ) + + # 2c. Numeric Type/Content Check + # Numeric datasets will need to be individually validated + # via other sections in QA (XML Checker, qa_reports, Metadata LUT + # checks, etc.) + + # Check root, then visit + _check_attributes("/", h5_file) + h5_file.visit(visitor_func) + + def identification_sanity_checks( id_group: h5py.Group, product_type: str ) -> None: @@ -443,25 +679,13 @@ def _verify_data_is_in_list( if _dataset_exists(ds_name): data = _get_string_dataset(ds_name=ds_name) if data is not None: - # TODO: Use a regex for more flexible pattern matching. - if data in ( - "", - "0", - "['0']", - "['']", - "['' '' '' '' '']", - "None", - "(NOT SPECIFIED)", - ): - log.error( - f"Dataset value is {data!r}, which is not a valid value." - f" Dataset: {_full_path(ds_name)}" - ) + ds_full_path = _full_path(ds_name) + if _log_if_bad_string_value(val=data, path=ds_full_path): passes = False else: log.warning( f"Dataset value is {data!r}, but it has not be automatically" - f" verified during checks. Dataset: {_full_path(ds_name)}" + f" verified during checks. Dataset: {ds_full_path}" ) else: passes = False