From 5d9c9523555c9da419664b04a12a226b03c0933c Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Sat, 28 Mar 2026 10:25:53 +0000 Subject: [PATCH 1/3] Add wildcard/glob pattern support for exclude_paths and include_paths --- deepdiff/deephash.py | 35 +- deepdiff/diff.py | 2364 +++++++------------------------------- deepdiff/helper.py | 24 + deepdiff/path.py | 161 +++ deepdiff/search.py | 8 +- docs/deephash_doc.rst | 2 + docs/diff_doc.rst | 4 +- docs/exclude_paths.rst | 43 + docs/search_doc.rst | 426 ++++++- tests/test_glob_paths.py | 719 ++++++++++++ 10 files changed, 1773 insertions(+), 2013 deletions(-) create mode 100644 tests/test_glob_paths.py diff --git a/deepdiff/deephash.py b/deepdiff/deephash.py index d26338e2..1ecc22a4 100644 --- a/deepdiff/deephash.py +++ b/deepdiff/deephash.py @@ -14,7 +14,8 @@ convert_item_or_items_into_compiled_regexes_else_none, get_id, type_is_subclass_of_type_group, type_in_type_group, number_to_string, datetime_normalize, KEY_TO_VAL_STR, - get_truncate_datetime, dict_, add_root_to_paths, PydanticBaseModel) + get_truncate_datetime, dict_, add_root_to_paths, PydanticBaseModel, + separate_wildcard_and_exact_paths) from deepdiff.base import Base @@ -189,6 +190,7 @@ def __init__(self, custom_operators: Optional[List[Any]] = None, default_timezone: Union[datetime.timezone, "BaseTzInfo"] = datetime.timezone.utc, encodings: Optional[List[str]] = None, + exclude_glob_paths: Optional[List[Any]] = None, exclude_obj_callback: Optional[Callable[[Any, str], bool]] = None, exclude_paths: Optional[PathType] = None, exclude_regex_paths: Optional[RegexType] = None, @@ -205,6 +207,7 @@ def __init__(self, ignore_type_in_groups: Any = None, ignore_type_subclasses: bool = False, ignore_uuid_types: bool = False, + include_glob_paths: Optional[List[Any]] = None, include_paths: Optional[PathType] = None, number_format_notation: str = "f", number_to_string_func: Optional[NumberToStringFunc] = None, @@ -231,8 +234,14 @@ def __init__(self, exclude_types = 
set() if exclude_types is None else set(exclude_types) self.exclude_types_tuple = tuple(exclude_types) # we need tuple for checking isinstance self.ignore_repetition = ignore_repetition - self.exclude_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(exclude_paths)) - self.include_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(include_paths)) + _exclude_set = convert_item_or_items_into_set_else_none(exclude_paths) + _exclude_exact, _exclude_globs = separate_wildcard_and_exact_paths(_exclude_set) + self.exclude_paths = add_root_to_paths(_exclude_exact) + self.exclude_glob_paths = exclude_glob_paths or _exclude_globs + _include_set = convert_item_or_items_into_set_else_none(include_paths) + _include_exact, _include_globs = separate_wildcard_and_exact_paths(_include_set) + self.include_paths = add_root_to_paths(_include_exact) + self.include_glob_paths = include_glob_paths or _include_globs self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths) self.hasher = default_hasher if hasher is None else hasher self.hashes[UNPROCESSED_KEY] = [] # type: ignore @@ -461,11 +470,21 @@ def _skip_this(self, obj: Any, parent: str) -> bool: skip = False if self.exclude_paths and parent in self.exclude_paths: skip = True - if self.include_paths and parent != 'root': - if parent not in self.include_paths: - skip = True - for prefix in self.include_paths: - if parent.startswith(prefix): + elif self.exclude_glob_paths and any(gp.match(parent) for gp in self.exclude_glob_paths): + skip = True + if (self.include_paths or self.include_glob_paths) and parent != 'root': + skip = True + if self.include_paths: + if parent in self.include_paths: + skip = False + else: + for prefix in self.include_paths: + if parent.startswith(prefix): + skip = False + break + if skip and self.include_glob_paths: + for gp in self.include_glob_paths: + if gp.match_or_is_ancestor(parent): skip = False break elif 
self.exclude_regex_paths and any( diff --git a/deepdiff/diff.py b/deepdiff/diff.py index 2931cefd..2ac62b5e 100755 --- a/deepdiff/diff.py +++ b/deepdiff/diff.py @@ -1,2012 +1,484 @@ -#!/usr/bin/env python - -# In order to run the docstrings: -# python3 -m deepdiff.diff -# You might need to run it many times since dictionaries come in different orders -# every time you run the docstrings. -# However the docstring expects it in a specific order in order to pass! -import difflib +import re import logging -import types -import datetime -import uuid -from enum import Enum -from copy import deepcopy -from math import isclose as is_close -from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol, Literal -from collections.abc import Mapping, Iterable, Sequence -from collections import defaultdict -from inspect import getmembers -from itertools import zip_longest +from ast import literal_eval from functools import lru_cache -from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent, - IndexedHash, unprocessed, add_to_frozen_set, basic_types, - convert_item_or_items_into_set_else_none, get_type, - convert_item_or_items_into_compiled_regexes_else_none, - type_is_subclass_of_type_group, type_in_type_group, get_doc, - number_to_string, datetime_normalize, KEY_TO_VAL_STR, booleans, - np_ndarray, np_floating, get_numpy_ndarray_rows, RepeatedTimer, - TEXT_VIEW, TREE_VIEW, DELTA_VIEW, COLORED_VIEW, COLORED_COMPACT_VIEW, - detailed__dict__, add_root_to_paths, - np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS, - PydanticBaseModel, Opcode, SetOrdered, ipranges) -from deepdiff.serialization import SerializationMixin -from deepdiff.distance import DistanceMixin, logarithmic_similarity -from deepdiff.model import ( - RemapDict, ResultDict, TextResult, TreeResult, DiffLevel, - DictRelationship, AttributeRelationship, REPORT_KEYS, - SubscriptableIterableRelationship, 
NonSubscriptableIterableRelationship, - SetRelationship, NumpyArrayRelationship, CUSTOM_FIELD, - FORCE_DEFAULT, -) -from deepdiff.deephash import DeepHash, combine_hashes_lists -from deepdiff.base import Base -from deepdiff.lfucache import LFUCache, DummyLFU -from deepdiff.colored_view import ColoredView - -if TYPE_CHECKING: - from pytz.tzinfo import BaseTzInfo - logger = logging.getLogger(__name__) -MAX_PASSES_REACHED_MSG = ( - 'DeepDiff has reached the max number of passes of {}. ' - 'You can possibly get more accurate results by increasing the max_passes parameter.') +GETATTR = 'GETATTR' +GET = 'GET' -MAX_DIFFS_REACHED_MSG = ( - 'DeepDiff has reached the max number of diffs of {}. ' - 'You can possibly get more accurate results by increasing the max_diffs parameter.') +class _WildcardToken: + """Sentinel object for wildcard path tokens. -notpresent_indexed = IndexedHash(indexes=[0], item=notpresent) + Using a dedicated class (instead of plain strings) ensures that a literal + dict key ``'*'`` (parsed from ``root['*']``) is never confused with the + wildcard ``*`` (parsed from ``root[*]``). + """ + def __init__(self, symbol): + self._symbol = symbol -doc = get_doc('diff_doc.rst') + def __repr__(self): + return self._symbol + def __eq__(self, other): + return isinstance(other, _WildcardToken) and self._symbol == other._symbol -PROGRESS_MSG = "DeepDiff {} seconds in progress. Pass #{}, Diff #{}" + def __hash__(self): + return hash(('_WildcardToken', self._symbol)) -def _report_progress(_stats: Dict[str, Any], progress_logger: Callable[[str], None], duration: float) -> None: - """ - Report the progress every few seconds. 
- """ - progress_logger(PROGRESS_MSG.format(duration, _stats[PASSES_COUNT], _stats[DIFF_COUNT])) - - -DISTANCE_CACHE_HIT_COUNT = 'DISTANCE CACHE HIT COUNT' -DIFF_COUNT = 'DIFF COUNT' -PASSES_COUNT = 'PASSES COUNT' -MAX_PASS_LIMIT_REACHED = 'MAX PASS LIMIT REACHED' -MAX_DIFF_LIMIT_REACHED = 'MAX DIFF LIMIT REACHED' -DISTANCE_CACHE_ENABLED = 'DISTANCE CACHE ENABLED' -PREVIOUS_DIFF_COUNT = 'PREVIOUS DIFF COUNT' -PREVIOUS_DISTANCE_CACHE_HIT_COUNT = 'PREVIOUS DISTANCE CACHE HIT COUNT' -CANT_FIND_NUMPY_MSG = 'Unable to import numpy. This must be a bug in DeepDiff since a numpy array is detected.' -INVALID_VIEW_MSG = "view parameter must be one of 'text', 'tree', 'delta', 'colored' or 'colored_compact'. But {} was passed." -CUTOFF_RANGE_ERROR_MSG = 'cutoff_distance_for_pairs needs to be a positive float max 1.' -VERBOSE_LEVEL_RANGE_MSG = 'verbose_level should be 0, 1, or 2.' -PURGE_LEVEL_RANGE_MSG = 'cache_purge_level should be 0, 1, or 2.' -_ENABLE_CACHE_EVERY_X_DIFF = '_ENABLE_CACHE_EVERY_X_DIFF' - -model_fields_set = frozenset(["model_fields_set"]) - - -# What is the threshold to consider 2 items to be pairs. Only used when ignore_order = True. -CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT = 0.3 - -# What is the threshold to calculate pairs of items between 2 iterables. -# For example 2 iterables that have nothing in common, do not need their pairs to be calculated. 
-CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT = 0.7 - -DEEPHASH_PARAM_KEYS = ( - 'exclude_types', - 'exclude_paths', - 'include_paths', - 'exclude_regex_paths', - 'hasher', - 'significant_digits', - 'number_format_notation', - 'ignore_string_type_changes', - 'ignore_numeric_type_changes', - 'ignore_uuid_types', - 'use_enum_value', - 'ignore_type_in_groups', - 'ignore_type_subclasses', - 'ignore_string_case', - 'exclude_obj_callback', - 'ignore_private_variables', - 'encodings', - 'ignore_encoding_errors', - 'default_timezone', - 'custom_operators', -) +SINGLE_WILDCARD = _WildcardToken('*') +MULTI_WILDCARD = _WildcardToken('**') -class DeepDiffProtocol(Protocol): - t1: Any - t2: Any - cutoff_distance_for_pairs: float - use_log_scale: bool - log_scale_similarity_threshold: float - view: str - math_epsilon: Optional[float] - - - -class DeepDiff(ResultDict, SerializationMixin, DistanceMixin, DeepDiffProtocol, Base): - __doc__ = doc - - CACHE_AUTO_ADJUST_THRESHOLD = 0.25 - - def __init__(self, - t1: Any, - t2: Any, - _original_type: Optional[Any]=None, - cache_purge_level: int=1, - cache_size: int=0, - cache_tuning_sample_size: int=0, - custom_operators: Optional[List[Any]] =None, - cutoff_distance_for_pairs: float=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT, - cutoff_intersection_for_pairs: float=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT, - default_timezone:Union[datetime.timezone, "BaseTzInfo"]=datetime.timezone.utc, - encodings: Optional[List[str]]=None, - exclude_obj_callback: Optional[Callable]=None, - exclude_obj_callback_strict: Optional[Callable]=None, - exclude_paths: Union[str, List[str], Set[str], FrozenSet[str], None]=None, - exclude_regex_paths: Union[str, List[str], Pattern[str], List[Pattern[str]], None]=None, - exclude_types: Optional[List[type]]=None, - get_deep_distance: bool=False, - group_by: Union[str, Tuple[str, str], Callable, None]=None, - group_by_sort_key: Union[str, Callable, None]=None, - hasher: Optional[Callable]=None, - hashes: Optional[Dict[Any, Any]]=None, - 
ignore_encoding_errors: bool=False, - ignore_nan_inequality: bool=False, - ignore_numeric_type_changes: bool=False, - ignore_order: bool=False, - ignore_order_func: Optional[Callable]=None, - ignore_private_variables: bool=True, - ignore_string_case: bool=False, - ignore_string_type_changes: bool=False, - ignore_type_in_groups: Optional[List[Tuple[Any, ...]]]=None, - ignore_type_subclasses: bool=False, - ignore_uuid_types: bool=False, - include_obj_callback: Optional[Callable]=None, - include_obj_callback_strict: Optional[Callable]=None, - include_paths: Union[str, List[str], None]=None, - iterable_compare_func: Optional[Callable]=None, - log_frequency_in_sec: int=0, - log_scale_similarity_threshold: float=0.1, - log_stacktrace: bool=False, - math_epsilon: Optional[float]=None, - max_diffs: Optional[int]=None, - max_passes: int=10000000, - number_format_notation: Literal["f", "e"]="f", - number_to_string_func: Optional[Callable]=None, - progress_logger: Callable[[str], None]=logger.info, - report_repetition: bool=False, - significant_digits: Optional[int]=None, - threshold_to_diff_deeper: float = 0.33, - truncate_datetime: Optional[str]=None, - use_enum_value: bool=False, - use_log_scale: bool=False, - verbose_level: int=1, - view: str=TEXT_VIEW, - zip_ordered_iterables: bool=False, - _parameters: Optional[Dict[str, Any]]=None, - _shared_parameters: Optional[Dict[str, Any]]=None, - **kwargs): - super().__init__() - if kwargs: - raise ValueError(( - "The following parameter(s) are not valid: %s\n" - "The valid parameters are ignore_order, report_repetition, significant_digits, " - "number_format_notation, exclude_paths, include_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, " - "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, ignore_uuid_types, truncate_datetime, " - "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, " - "view, hasher, hashes, max_passes, max_diffs, 
zip_ordered_iterables, " - "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, " - "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, log_stacktrace," - "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, default_timezone " - "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold " - "_parameters and _shared_parameters.") % ', '.join(kwargs.keys())) - - if _parameters: - self.__dict__.update(_parameters) - else: - self.custom_operators = custom_operators or [] - self.ignore_order = ignore_order - - self.ignore_order_func = ignore_order_func - - ignore_type_in_groups = ignore_type_in_groups or [] - if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups: - ignore_numeric_type_changes = True - self.ignore_numeric_type_changes = ignore_numeric_type_changes - if strings == ignore_type_in_groups or strings in ignore_type_in_groups: - ignore_string_type_changes = True - # Handle ignore_uuid_types - check if uuid+str group is already in ignore_type_in_groups - uuid_str_group = (uuids[0], str) - if uuid_str_group == ignore_type_in_groups or uuid_str_group in ignore_type_in_groups: - ignore_uuid_types = True - self.ignore_uuid_types = ignore_uuid_types - self.use_enum_value = use_enum_value - self.log_scale_similarity_threshold = log_scale_similarity_threshold - self.use_log_scale = use_log_scale - self.default_timezone = default_timezone - self.log_stacktrace = log_stacktrace - self.threshold_to_diff_deeper = threshold_to_diff_deeper - self.ignore_string_type_changes = ignore_string_type_changes - self.ignore_type_in_groups = self.get_ignore_types_in_groups( - ignore_type_in_groups=ignore_type_in_groups, - ignore_string_type_changes=ignore_string_type_changes, - ignore_numeric_type_changes=ignore_numeric_type_changes, - ignore_type_subclasses=ignore_type_subclasses, - 
ignore_uuid_types=ignore_uuid_types) - self.report_repetition = report_repetition - self.exclude_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(exclude_paths)) - self.include_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(include_paths)) - self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths) - self.exclude_types = set(exclude_types) if exclude_types else None - self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance - self.ignore_type_subclasses = ignore_type_subclasses - self.type_check_func = type_in_type_group if ignore_type_subclasses else type_is_subclass_of_type_group - self.ignore_string_case = ignore_string_case - self.exclude_obj_callback = exclude_obj_callback - self.exclude_obj_callback_strict = exclude_obj_callback_strict - self.include_obj_callback = include_obj_callback - self.include_obj_callback_strict = include_obj_callback_strict - self.number_to_string = number_to_string_func or number_to_string - self.iterable_compare_func = iterable_compare_func - self.zip_ordered_iterables = zip_ordered_iterables - self.ignore_private_variables = ignore_private_variables - self.ignore_nan_inequality = ignore_nan_inequality - self.hasher = hasher - self.cache_tuning_sample_size = cache_tuning_sample_size - self.group_by = group_by - if callable(group_by_sort_key): - self.group_by_sort_key = group_by_sort_key - elif group_by_sort_key: - def _group_by_sort_key(x): - return x[group_by_sort_key] - self.group_by_sort_key = _group_by_sort_key - else: - self.group_by_sort_key = None - self.encodings = encodings - self.ignore_encoding_errors = ignore_encoding_errors - - self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes) - self.math_epsilon = math_epsilon - if self.math_epsilon is not None and self.ignore_order: - logger.warning("math_epsilon in conjunction with 
ignore_order=True is only used for flat object comparisons. Custom math_epsilon will not have an effect when comparing nested objects.") - self.truncate_datetime = get_truncate_datetime(truncate_datetime) - self.number_format_notation = number_format_notation - if verbose_level in {0, 1, 2}: - self.verbose_level = verbose_level - else: - raise ValueError(VERBOSE_LEVEL_RANGE_MSG) - if cache_purge_level not in {0, 1, 2}: - raise ValueError(PURGE_LEVEL_RANGE_MSG) - self.view = view - # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running. - self.max_passes = max_passes - self.max_diffs = max_diffs - self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs) - self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs) - if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1: - raise ValueError(CUTOFF_RANGE_ERROR_MSG) - # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above - # cleaning functionalities when running DeepDiff recursively. - # However DeepHash has its own set of _parameters that are slightly different than DeepDIff. - # DeepDiff _parameters are transformed to DeepHash _parameters via _get_deephash_params method. - self.progress_logger = progress_logger - self.cache_size = cache_size - _parameters = self.__dict__.copy() - _parameters['group_by'] = None # overwriting since these parameters will be passed on to other passes. 
- if log_stacktrace: - self.log_err = logger.exception - else: - self.log_err = logger.error - - # Non-Root - if _shared_parameters: - self.is_root = False - self._shared_parameters = _shared_parameters - self.__dict__.update(_shared_parameters) - # We are in some pass other than root - progress_timer = None - # Root - else: - self.is_root = True - # Caching the DeepDiff results for dynamic programming - self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU() - self._stats = { - PASSES_COUNT: 0, - DIFF_COUNT: 0, - DISTANCE_CACHE_HIT_COUNT: 0, - PREVIOUS_DIFF_COUNT: 0, - PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0, - MAX_PASS_LIMIT_REACHED: False, - MAX_DIFF_LIMIT_REACHED: False, - DISTANCE_CACHE_ENABLED: bool(cache_size), - } - self.hashes = dict_() if hashes is None else hashes - self._numpy_paths = dict_() # if _numpy_paths is None else _numpy_paths - self.group_by_keys = set() # Track keys that originated from group_by operations - self._shared_parameters = { - 'hashes': self.hashes, - '_stats': self._stats, - '_distance_cache': self._distance_cache, - 'group_by_keys': self.group_by_keys, - '_numpy_paths': self._numpy_paths, - _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10, - } - if log_frequency_in_sec: - # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds. 
- progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger) - else: - progress_timer = None - - self._parameters = _parameters - self.deephash_parameters = self._get_deephash_params() - self.tree = TreeResult() - self._iterable_opcodes = {} - if group_by and self.is_root: - try: - original_t1 = t1 - t1 = self._group_iterable_to_dict(t1, group_by, item_name='t1') - except (KeyError, ValueError): - pass - else: - try: - t2 = self._group_iterable_to_dict(t2, group_by, item_name='t2') - except (KeyError, ValueError): - t1 = original_t1 - - self.t1 = t1 - self.t2 = t2 - - try: - root = DiffLevel(t1, t2, verbose_level=self.verbose_level) - # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays. - # The reason is that we convert the numpy array to python list and then later for distance calculations - # we convert only the the last dimension of it into numpy arrays. - self._diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type) - - if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}: - self.tree['deep_distance'] = self._get_rough_distance() - - self.tree.remove_empty_keys() - view_results = self._get_view_results(self.view) - if isinstance(view_results, ColoredView): - self.update(view_results.tree) - self._colored_view = view_results - else: - self.update(view_results) - finally: - if self.is_root: - if cache_purge_level: - del self._distance_cache - del self.hashes - del self._shared_parameters - del self._parameters - for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT, - DISTANCE_CACHE_ENABLED): - del self._stats[key] - if progress_timer: - duration = progress_timer.stop() - self._stats['DURATION SEC'] = duration - logger.info('stats {}'.format(self.get_stats())) - if cache_purge_level == 2: - self.__dict__.clear() - - def _get_deephash_params(self): - result = {key: self._parameters[key] for key in DEEPHASH_PARAM_KEYS} - 
result['ignore_repetition'] = not self.report_repetition - result['number_to_string_func'] = self.number_to_string - return result - - def _report_result(self, report_type, change_level, local_tree=None): - """ - Add a detected change to the reference-style result dictionary. - report_type will be added to level. - (We'll create the text-style report from there later.) - :param report_type: A well defined string key describing the type of change. - Examples: "set_item_added", "values_changed" - :param change_level: A DiffLevel object describing the objects in question in their - before-change and after-change object structure. - - :local_tree: None - """ +class PathExtractionError(ValueError): + pass - if not self._skip_this(change_level): - change_level.report_type = report_type - tree = self.tree if local_tree is None else local_tree - tree[report_type].add(change_level) - def custom_report_result(self, report_type, level, extra_info=None): - """ - Add a detected change to the reference-style result dictionary. - report_type will be added to level. - (We'll create the text-style report from there later.) - - :param report_type: A well defined string key describing the type of change. - Examples: "set_item_added", "values_changed" - :param parent: A DiffLevel object describing the objects in question in their - before-change and after-change object structure. 
- :param extra_info: A dict that describe this result - :rtype: None - """ +class RootCanNotBeModified(ValueError): + pass - if not self._skip_this(level): - level.report_type = report_type - level.additional[CUSTOM_FIELD] = extra_info - self.tree[report_type].add(level) - @staticmethod - def _dict_from_slots(object: Any) -> Dict[str, Any]: - def unmangle(attribute: str) -> str: - if attribute.startswith('__') and attribute != '__weakref__': - return '_{type}{attribute}'.format( - type=type(object).__name__, - attribute=attribute - ) - return attribute - - all_slots = [] - - if isinstance(object, type): - mro = object.__mro__ # pragma: no cover. I have not been able to write a test for this case. But we still check for it. +def _add_to_elements(elements, elem, inside): + # Ignore private items + if not elem: + return + if not elem.startswith('__'): + # Handle wildcard tokens (* and **) as-is. + # Unquoted root[*] arrives as bare '*' which matches the string check. + # Quoted root['*'] arrives as "'*'" which does NOT match, so it falls + # through to literal_eval and becomes the plain string '*' — which is + # distinct from the _WildcardToken sentinel and thus treated as a + # literal dict key. + if elem in ('*', '**'): + action = GETATTR if inside == '.' else GET + elements.append((SINGLE_WILDCARD if elem == '*' else MULTI_WILDCARD, action)) + return + remove_quotes = False + if '𝆺𝅥𝅯' in elem or '\\' in elem: + remove_quotes = True else: - mro = object.__class__.__mro__ + try: + elem = literal_eval(elem) + remove_quotes = False + except (ValueError, SyntaxError): + remove_quotes = True + if remove_quotes and elem[0] == elem[-1] and elem[0] in {'"', "'"}: + elem = elem[1: -1] + action = GETATTR if inside == '.' 
else GET + elements.append((elem, action)) - for type_in_mro in mro: - slots = getattr(type_in_mro, '__slots__', None) - if slots: - if isinstance(slots, strings): - all_slots.append(slots) - else: - all_slots.extend(slots) - - return {i: getattr(object, key) for i in all_slots if hasattr(object, key := unmangle(i))} - - def _diff_enum(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), local_tree: Optional[Any]=None) -> None: - t1 = detailed__dict__(level.t1, include_keys=ENUM_INCLUDE_KEYS) - t2 = detailed__dict__(level.t2, include_keys=ENUM_INCLUDE_KEYS) - - self._diff_dict( - level, - parents_ids, - print_as_attribute=True, - override=True, - override_t1=t1, - override_t2=t2, - local_tree=local_tree, - ) - - def _diff_obj(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), is_namedtuple: bool=False, local_tree: Optional[Any]=None, is_pydantic_object: bool=False) -> None: - """Difference of 2 objects""" - processing_error = False - t1: Optional[Dict[str, Any]] = None - t2: Optional[Dict[str, Any]] = None - try: - if is_namedtuple: - t1 = level.t1._asdict() - t2 = level.t2._asdict() - elif is_pydantic_object: - t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set) - t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set) - elif all('__dict__' in dir(t) for t in level): - t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables) - t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables) - elif all('__slots__' in dir(t) for t in level): - t1 = self._dict_from_slots(level.t1) - t2 = self._dict_from_slots(level.t2) - else: - t1 = {k: v for k, v in getmembers(level.t1) if not callable(v)} - t2 = {k: v for k, v in getmembers(level.t2) if not callable(v)} - except AttributeError: - processing_error = True - if processing_error is True or t1 is None or t2 is None: - 
self._report_result('unprocessed', level, local_tree=local_tree) - return - self._diff_dict( - level, - parents_ids, - print_as_attribute=True, - override=True, - override_t1=t1, - override_t2=t2, - local_tree=local_tree, - ) - - def _skip_this(self, level: Any) -> bool: - """ - Check whether this comparison should be skipped because one of the objects to compare meets exclusion criteria. - :rtype: bool - """ - level_path = level.path() - skip = False - if self.exclude_paths and level_path in self.exclude_paths: - skip = True - if self.include_paths and level_path != 'root': - if level_path not in self.include_paths: - skip = True - for prefix in self.include_paths: - if prefix in level_path or level_path in prefix: - skip = False - break - elif self.exclude_regex_paths and any( - [exclude_regex_path.search(level_path) for exclude_regex_path in self.exclude_regex_paths]): - skip = True - elif self.exclude_types_tuple and \ - (isinstance(level.t1, self.exclude_types_tuple) or isinstance(level.t2, self.exclude_types_tuple)): - skip = True - elif self.exclude_obj_callback and \ - (self.exclude_obj_callback(level.t1, level_path) or self.exclude_obj_callback(level.t2, level_path)): - skip = True - elif self.exclude_obj_callback_strict and \ - (self.exclude_obj_callback_strict(level.t1, level_path) and - self.exclude_obj_callback_strict(level.t2, level_path)): - skip = True - elif self.include_obj_callback and level_path != 'root': - skip = True - if (self.include_obj_callback(level.t1, level_path) or self.include_obj_callback(level.t2, level_path)): - skip = False - elif self.include_obj_callback_strict and level_path != 'root': - skip = True - if (self.include_obj_callback_strict(level.t1, level_path) and - self.include_obj_callback_strict(level.t2, level_path)): - skip = False - - return skip - - def _skip_this_key(self, level: Any, key: Any) -> bool: - # if include_paths is not set, than treet every path as included - if self.include_paths is None: - return False - 
if "{}['{}']".format(level.path(), key) in self.include_paths: - return False - if level.path() in self.include_paths: - # matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']"] - return False - for prefix in self.include_paths: - if "{}['{}']".format(level.path(), key) in prefix: - # matches as long the prefix is longer than this object key - # eg.: level+key root['foo']['bar'] matches prefix root['foo']['bar'] from include paths - # level+key root['foo'] matches prefix root['foo']['bar'] from include_paths - # level+key root['foo']['bar'] DOES NOT match root['foo'] from include_paths This needs to be handled afterwards - return False - # check if a higher level is included as a whole (=without any sublevels specified) - # matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']"] - # but does not match, if it is level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']['fruits']"] - up = level.up - while up is not None: - if up.path() in self.include_paths: - return False - up = up.up - return True +DEFAULT_FIRST_ELEMENT = ('root', GETATTR) - def _get_clean_to_keys_mapping(self, keys: Any, level: Any) -> Dict[Any, Any]: - """ - Get a dictionary of cleaned value of keys to the keys themselves. - This is mainly used to transform the keys when the type changes of keys should be ignored. - TODO: needs also some key conversion for groups of types other than the built-in strings and numbers. 
- """ - result = dict_() - for key in keys: - if self.ignore_string_type_changes and isinstance(key, bytes): - clean_key = key.decode('utf-8') - elif self.ignore_string_type_changes and isinstance(key, memoryview): - clean_key = key.tobytes().decode('utf-8') - elif self.use_enum_value and isinstance(key, Enum): - clean_key = key.value - elif isinstance(key, numbers): - # Skip type prefixing for keys that originated from group_by operations - if hasattr(self, 'group_by_keys') and key in self.group_by_keys: - if self.significant_digits is None: - clean_key = key - else: - clean_key = self.number_to_string(key, significant_digits=self.significant_digits, - number_format_notation=self.number_format_notation) # type: ignore # type: ignore +@lru_cache(maxsize=1024 * 128) +def _path_to_elements(path, root_element=DEFAULT_FIRST_ELEMENT): + """ + Given a path, it extracts the elements that form the path and their relevant most likely retrieval action. + + >>> from deepdiff import _path_to_elements + >>> path = "root[4.3].b['a3']" + >>> _path_to_elements(path, root_element=None) + [(4.3, 'GET'), ('b', 'GETATTR'), ('a3', 'GET')] + """ + if isinstance(path, (tuple, list)): + return path + elements = [] + if root_element: + elements.append(root_element) + elem = '' + inside = False + prev_char = None + path = path[4:] # removing "root from the beginning" + brackets = [] + inside_quotes = False + quote_used = '' + for char in path: + if prev_char == '𝆺𝅥𝅯': + elem += char + elif char in {'"', "'"}: + elem += char + # If we are inside and the quote is not what we expected, the quote is not closing + if not(inside_quotes and quote_used != char): + inside_quotes = not inside_quotes + if inside_quotes: + quote_used = char else: - type_ = "number" if self.ignore_numeric_type_changes else key.__class__.__name__ - if self.significant_digits is None: - clean_key = key - else: - clean_key = self.number_to_string(key, significant_digits=self.significant_digits, - 
number_format_notation=self.number_format_notation) # type: ignore # type: ignore - clean_key = KEY_TO_VAL_STR.format(type_, clean_key) + _add_to_elements(elements, elem, inside) + elem = '' + quote_used = '' + elif inside_quotes: + elem += char + elif char == '[': + if inside == '.': + _add_to_elements(elements, elem, inside) + inside = '[' + elem = '' + # we are already inside. The bracket is a part of the word. + elif inside == '[': + elem += char else: - clean_key = key - if self.ignore_string_case and isinstance(clean_key, str): - clean_key = clean_key.lower() - if clean_key in result: - logger.warning(('{} and {} in {} become the same key when ignore_numeric_type_changes' - 'or ignore_numeric_type_changes are set to be true.').format( - key, result[clean_key], level.path())) + inside = '[' + brackets.append('[') + elem = '' + elif char == '.': + if inside == '[': + elem += char + elif inside == '.': + _add_to_elements(elements, elem, inside) + elem = '' else: - result[clean_key] = key - return result - - def _diff_dict( - self, - level: Any, - parents_ids: FrozenSet[int]=frozenset([]), - print_as_attribute: bool=False, - override: bool=False, - override_t1: Optional[Any]=None, - override_t2: Optional[Any]=None, - local_tree: Optional[Any]=None, - ) -> None: - """Difference of 2 dictionaries""" - if override: - # for special stuff like custom objects and named tuples we receive preprocessed t1 and t2 - # but must not spoil the chain (=level) with it - t1 = override_t1 - t2 = override_t2 - else: - t1 = level.t1 - t2 = level.t2 - - if print_as_attribute: - item_added_key = "attribute_added" - item_removed_key = "attribute_removed" - rel_class = AttributeRelationship - else: - item_added_key = "dictionary_item_added" - item_removed_key = "dictionary_item_removed" - rel_class = DictRelationship - - if self.ignore_private_variables: - t1_keys = SetOrdered([key for key in t1 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, 
key)]) - t2_keys = SetOrdered([key for key in t2 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, key)]) - else: - t1_keys = SetOrdered([key for key in t1 if not self._skip_this_key(level, key)]) - t2_keys = SetOrdered([key for key in t2 if not self._skip_this_key(level, key)]) - if self.ignore_string_type_changes or self.ignore_numeric_type_changes or self.ignore_string_case: - t1_clean_to_keys = self._get_clean_to_keys_mapping(keys=t1_keys, level=level) - t2_clean_to_keys = self._get_clean_to_keys_mapping(keys=t2_keys, level=level) - t1_keys = SetOrdered(t1_clean_to_keys.keys()) - t2_keys = SetOrdered(t2_clean_to_keys.keys()) - else: - t1_clean_to_keys = t2_clean_to_keys = None - - t_keys_intersect = t2_keys & t1_keys - t_keys_added = t2_keys - t_keys_intersect - t_keys_removed = t1_keys - t_keys_intersect - - if self.threshold_to_diff_deeper: - if self.exclude_paths: - t_keys_union = {f"{level.path()}[{repr(key)}]" for key in (t2_keys | t1_keys)} - t_keys_union -= self.exclude_paths - t_keys_union_len = len(t_keys_union) + inside = '.' + elem = '' + elif char == ']': + if brackets and brackets[-1] == '[': + brackets.pop() + if brackets: + elem += char else: - t_keys_union_len = len(t2_keys | t1_keys) - if t_keys_union_len > 1 and len(t_keys_intersect) / t_keys_union_len < self.threshold_to_diff_deeper: - self._report_result('values_changed', level, local_tree=local_tree) - return - - for key in t_keys_added: - if self._count_diff() is StopIteration: - return - - key = t2_clean_to_keys[key] if t2_clean_to_keys else key - change_level = level.branch_deeper( - notpresent, - t2[key], - child_relationship_class=rel_class, - child_relationship_param=key, - child_relationship_param2=key, - ) - self._report_result(item_added_key, change_level, local_tree=local_tree) - - for key in t_keys_removed: - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. 
- - key = t1_clean_to_keys[key] if t1_clean_to_keys else key - change_level = level.branch_deeper( - t1[key], - notpresent, - child_relationship_class=rel_class, - child_relationship_param=key, - child_relationship_param2=key, - ) - self._report_result(item_removed_key, change_level, local_tree=local_tree) - - for key in t_keys_intersect: # key present in both dicts - need to compare values - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. - - key1 = t1_clean_to_keys[key] if t1_clean_to_keys else key - key2 = t2_clean_to_keys[key] if t2_clean_to_keys else key - item_id = id(t1[key1]) - if parents_ids and item_id in parents_ids: - continue - parents_ids_added = add_to_frozen_set(parents_ids, item_id) - - # Go one level deeper - next_level = level.branch_deeper( - t1[key1], - t2[key2], - child_relationship_class=rel_class, - child_relationship_param=key, - child_relationship_param2=key, - ) - self._diff(next_level, parents_ids_added, local_tree=local_tree) - - def _diff_set(self, level: Any, local_tree: Optional[Any]=None) -> None: - """Difference of sets""" - t1_hashtable = self._create_hashtable(level, 't1') - t2_hashtable = self._create_hashtable(level, 't2') - - t1_hashes = set(t1_hashtable.keys()) - t2_hashes = set(t2_hashtable.keys()) - - hashes_added = t2_hashes - t1_hashes - hashes_removed = t1_hashes - t2_hashes - - items_added = [t2_hashtable[i].item for i in hashes_added] - items_removed = [t1_hashtable[i].item for i in hashes_removed] - - for item in items_added: - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. - - change_level = level.branch_deeper( - notpresent, item, child_relationship_class=SetRelationship) - self._report_result('set_item_added', change_level, local_tree=local_tree) - - for item in items_removed: - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. 
- - change_level = level.branch_deeper( - item, notpresent, child_relationship_class=SetRelationship) - self._report_result('set_item_removed', change_level, local_tree=local_tree) - - @staticmethod - def _iterables_subscriptable(t1: Any, t2: Any) -> bool: - try: - if getattr(t1, '__getitem__') and getattr(t2, '__getitem__'): - return True - else: # pragma: no cover - return False # should never happen - except AttributeError: - return False - - def _diff_iterable(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), _original_type: Optional[type]=None, local_tree: Optional[Any]=None) -> None: - """Difference of iterables""" - if (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order: - self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree) - else: - self._diff_iterable_in_order(level, parents_ids, _original_type=_original_type, local_tree=local_tree) - - def _compare_in_order( - self, level, - t1_from_index=None, t1_to_index=None, - t2_from_index=None, t2_to_index=None - ) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]: - """ - Default compare if `iterable_compare_func` is not provided. - This will compare in sequence order. - """ - if t1_from_index is None: - return [((i, i), (x, y)) for i, (x, y) in enumerate( - zip_longest( - level.t1, level.t2, fillvalue=ListItemRemovedOrAdded))] + _add_to_elements(elements, elem, inside) + elem = '' + inside = False else: - t1_chunk = level.t1[t1_from_index:t1_to_index] - t2_chunk = level.t2[t2_from_index:t2_to_index] - return [((i + t1_from_index, i + t2_from_index), (x, y)) for i, (x, y) in enumerate( - zip_longest( - t1_chunk, t2_chunk, fillvalue=ListItemRemovedOrAdded))] - - def _get_matching_pairs( - self, level, - t1_from_index=None, t1_to_index=None, - t2_from_index=None, t2_to_index=None - ) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]: - """ - Given a level get matching pairs. 
This returns list of two tuples in the form: - [ - (t1 index, t2 index), (t1 item, t2 item) - ] - - This will compare using the passed in `iterable_compare_func` if available. - Default it to compare in order - """ - - if self.iterable_compare_func is None: - # Match in order if there is no compare function provided - return self._compare_in_order( - level, - t1_from_index=t1_from_index, t1_to_index=t1_to_index, - t2_from_index=t2_from_index, t2_to_index=t2_to_index, - ) - try: - matches = [] - y_matched = set() - y_index_matched = set() - for i, x in enumerate(level.t1): - x_found = False - for j, y in enumerate(level.t2): - - if(j in y_index_matched): - # This ensures a one-to-one relationship of matches from t1 to t2. - # If y this index in t2 has already been matched to another x - # it cannot have another match, so just continue. - continue - - if(self.iterable_compare_func(x, y, level)): - deep_hash = DeepHash(y, - hashes=self.hashes, - apply_hash=True, - **self.deephash_parameters, - ) - y_index_matched.add(j) - y_matched.add(deep_hash[y]) - matches.append(((i, j), (x, y))) - x_found = True - break - - if(not x_found): - matches.append(((i, -1), (x, ListItemRemovedOrAdded))) - for j, y in enumerate(level.t2): - - deep_hash = DeepHash(y, - hashes=self.hashes, - apply_hash=True, - **self.deephash_parameters, - ) - if(deep_hash[y] not in y_matched): - matches.append(((-1, j), (ListItemRemovedOrAdded, y))) - return matches - except CannotCompare: - return self._compare_in_order( - level, - t1_from_index=t1_from_index, t1_to_index=t1_to_index, - t2_from_index=t2_from_index, t2_to_index=t2_to_index - ) - - def _diff_iterable_in_order(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None): - # We're handling both subscriptable and non-subscriptable iterables. Which one is it? 
- subscriptable = self._iterables_subscriptable(level.t1, level.t2) - if subscriptable: - child_relationship_class = SubscriptableIterableRelationship - else: - child_relationship_class = NonSubscriptableIterableRelationship - - if ( - not self.zip_ordered_iterables - and isinstance(level.t1, Sequence) - and isinstance(level.t2, Sequence) - and self._all_values_basic_hashable(level.t1) - and self._all_values_basic_hashable(level.t2) - and self.iterable_compare_func is None - ): - local_tree_pass = TreeResult() - opcodes_with_values = self._diff_ordered_iterable_by_difflib( - level, - parents_ids=parents_ids, - _original_type=_original_type, - child_relationship_class=child_relationship_class, - local_tree=local_tree_pass, - ) - # Sometimes DeepDiff's old iterable diff does a better job than DeepDiff - if len(local_tree_pass) > 1: - local_tree_pass2 = TreeResult() - self._diff_by_forming_pairs_and_comparing_one_by_one( - level, - parents_ids=parents_ids, - _original_type=_original_type, - child_relationship_class=child_relationship_class, - local_tree=local_tree_pass2, - ) - if len(local_tree_pass) >= len(local_tree_pass2): - local_tree_pass = local_tree_pass2 - else: - self._iterable_opcodes[level.path(force=FORCE_DEFAULT)] = opcodes_with_values - for report_type, levels in local_tree_pass.items(): - if levels: - self.tree[report_type] |= levels - else: - self._diff_by_forming_pairs_and_comparing_one_by_one( - level, - parents_ids=parents_ids, - _original_type=_original_type, - child_relationship_class=child_relationship_class, - local_tree=local_tree, - ) - - def _all_values_basic_hashable(self, iterable: Iterable[Any]) -> bool: - """ - Are all items basic hashable types? - Or there are custom types too? 
- """ - - # We don't want to exhaust a generator - if isinstance(iterable, types.GeneratorType): - return False - for item in iterable: - if not isinstance(item, basic_types): - return False - return True - - def _diff_by_forming_pairs_and_comparing_one_by_one( - self, level, local_tree, parents_ids=frozenset(), - _original_type=None, child_relationship_class=None, - t1_from_index=None, t1_to_index=None, - t2_from_index=None, t2_to_index=None, - ): - for (i, j), (x, y) in self._get_matching_pairs( - level, - t1_from_index=t1_from_index, t1_to_index=t1_to_index, - t2_from_index=t2_from_index, t2_to_index=t2_to_index - ): - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. - - reference_param1 = i - reference_param2 = j - if y is ListItemRemovedOrAdded: # item removed completely - change_level = level.branch_deeper( - x, - notpresent, - child_relationship_class=child_relationship_class, - child_relationship_param=reference_param1, - child_relationship_param2=reference_param2, - ) - self._report_result('iterable_item_removed', change_level, local_tree=local_tree) - - elif x is ListItemRemovedOrAdded: # new item added - change_level = level.branch_deeper( - notpresent, - y, - child_relationship_class=child_relationship_class, - child_relationship_param=reference_param1, - child_relationship_param2=reference_param2, - ) - self._report_result('iterable_item_added', change_level, local_tree=local_tree) - - else: # check if item value has changed - if (i != j and ((x == y) or self.iterable_compare_func)): - # Item moved - change_level = level.branch_deeper( - x, - y, - child_relationship_class=child_relationship_class, - child_relationship_param=reference_param1, - child_relationship_param2=reference_param2 - ) - self._report_result('iterable_item_moved', change_level, local_tree=local_tree) - - if self.iterable_compare_func: - # Mark additional context denoting that we have moved an item. 
- # This will allow for correctly setting paths relative to t2 when using an iterable_compare_func - level.additional["moved"] = True - - else: - continue - - item_id = id(x) - if parents_ids and item_id in parents_ids: - continue - parents_ids_added = add_to_frozen_set(parents_ids, item_id) - - # Go one level deeper - next_level = level.branch_deeper( - x, - y, - child_relationship_class=child_relationship_class, - child_relationship_param=reference_param1, - child_relationship_param2=reference_param2 - ) - self._diff(next_level, parents_ids_added, local_tree=local_tree) - - def _diff_ordered_iterable_by_difflib( - self, level, local_tree, parents_ids=frozenset(), _original_type=None, child_relationship_class=None, - ): - - seq = difflib.SequenceMatcher(isjunk=None, a=level.t1, b=level.t2, autojunk=False) - - opcodes = seq.get_opcodes() - opcodes_with_values = [] - - # TODO: this logic should be revisted so we detect reverse operations - # like when a replacement happens at index X and a reverse replacement happens at index Y - # in those cases we have a "iterable_item_moved" operation. 
- for tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index in opcodes: - if tag == 'equal': - opcodes_with_values.append(Opcode( - tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, - )) - continue - # print('{:7} t1[{}:{}] --> t2[{}:{}] {!r:>8} --> {!r}'.format( - # tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, level.t1[t1_from_index:t1_to_index], level.t2[t2_from_index:t2_to_index])) - - opcodes_with_values.append(Opcode( - tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, - old_values = level.t1[t1_from_index: t1_to_index], - new_values = level.t2[t2_from_index: t2_to_index], - )) - - if tag == 'replace': - self._diff_by_forming_pairs_and_comparing_one_by_one( - level, local_tree=local_tree, parents_ids=parents_ids, - _original_type=_original_type, child_relationship_class=child_relationship_class, - t1_from_index=t1_from_index, t1_to_index=t1_to_index, - t2_from_index=t2_from_index, t2_to_index=t2_to_index, - ) - elif tag == 'delete': - for index, x in enumerate(level.t1[t1_from_index:t1_to_index]): - change_level = level.branch_deeper( - x, - notpresent, - child_relationship_class=child_relationship_class, - child_relationship_param=index + t1_from_index, - child_relationship_param2=index + t1_from_index, - ) - self._report_result('iterable_item_removed', change_level, local_tree=local_tree) - elif tag == 'insert': - for index, y in enumerate(level.t2[t2_from_index:t2_to_index]): - change_level = level.branch_deeper( - notpresent, - y, - child_relationship_class=child_relationship_class, - child_relationship_param=index + t2_from_index, - child_relationship_param2=index + t2_from_index, - ) - self._report_result('iterable_item_added', change_level, local_tree=local_tree) - return opcodes_with_values - - - def _diff_str(self, level, local_tree=None): - """Compare strings""" - if self.ignore_string_case: - level.t1 = level.t1.lower() - level.t2 = level.t2.lower() - - if type(level.t1) == type(level.t2) and level.t1 == 
level.t2: # NOQA - return - - # do we add a diff for convenience? - do_diff = True - t1_str = level.t1 - t2_str = level.t2 - - if isinstance(level.t1, memoryview): - try: - t1_str = level.t1.tobytes().decode('ascii') - except UnicodeDecodeError: - do_diff = False - elif isinstance(level.t1, bytes_type): - try: - t1_str = level.t1.decode('ascii') - except UnicodeDecodeError: - do_diff = False - - if isinstance(level.t2, memoryview): - try: - t2_str = level.t2.tobytes().decode('ascii') - except UnicodeDecodeError: - do_diff = False - elif isinstance(level.t2, bytes_type): + elem += char + prev_char = char + if elem: + _add_to_elements(elements, elem, inside) + return tuple(elements) + + +def _get_nested_obj(obj, elements, next_element=None): + for (elem, action) in elements: + check_elem(elem) + if action == GET: + obj = obj[elem] + elif action == GETATTR: + obj = getattr(obj, elem) + return obj + + +def _guess_type(elements, elem, index, next_element): + # If we are not at the last elements + if index < len(elements) - 1: + # We assume it is a nested dictionary not a nested list + return {} + if isinstance(next_element, int): + return [] + return {} + + +def check_elem(elem): + if isinstance(elem, str) and elem.startswith("__") and elem.endswith("__"): + raise ValueError("traversing dunder attributes is not allowed") + + +def _get_nested_obj_and_force(obj, elements, next_element=None): + prev_elem = None + prev_action = None + prev_obj = obj + for index, (elem, action) in enumerate(elements): + check_elem(elem) + _prev_obj = obj + if action == GET: try: - t2_str = level.t2.decode('ascii') - except UnicodeDecodeError: - do_diff = False - - if isinstance(level.t1, Enum): - t1_str = level.t1.value - - if isinstance(level.t2, Enum): - t2_str = level.t2.value + obj = obj[elem] + prev_obj = _prev_obj + except KeyError: + obj[elem] = _guess_type(elements, elem, index, next_element) + obj = obj[elem] + prev_obj = _prev_obj + except IndexError: + if isinstance(obj, list) and 
isinstance(elem, int) and elem >= len(obj):
+                    obj.extend([None] * (elem - len(obj)))
+                    obj.append(_guess_type(elements, elem, index, next_element))
+                    obj = obj[-1]
+                    prev_obj = _prev_obj
+                elif isinstance(obj, list) and len(obj) == 0 and prev_elem:
+                    # We ran into an empty list that should have been a dictionary
+                    # We need to change it from an empty list to a dictionary
+                    obj = {elem: _guess_type(elements, elem, index, next_element)}
+                    if prev_action == GET:
+                        prev_obj[prev_elem] = obj
+                    else:
+                        setattr(prev_obj, prev_elem, obj)
+                    obj = obj[elem]
+        elif action == GETATTR:
+            obj = getattr(obj, elem)
+            prev_obj = _prev_obj
+        prev_elem = elem
+        prev_action = action
+    return obj
 
-        if t1_str == t2_str:
-            return
-        if do_diff:
-            if '\n' in t1_str or isinstance(t2_str, str) and '\n' in t2_str:
-                diff = difflib.unified_diff(
-                    t1_str.splitlines(), t2_str.splitlines(), lineterm='')
-                diff = list(diff)
-                if diff:
-                    level.additional['diff'] = '\n'.join(diff)
-
-        self._report_result('values_changed', level, local_tree=local_tree)
-
-    def _diff_tuple(self, level, parents_ids, local_tree=None):
-        # Checking to see if it has _fields. Which probably means it is a named
-        # tuple.
-        try:
-            level.t1._asdict
-        # It must be a normal tuple
-        except AttributeError:
-            self._diff_iterable(level, parents_ids, local_tree=local_tree)
-        # We assume it is a namedtuple then
-        else:
-            self._diff_obj(level, parents_ids, is_namedtuple=True, local_tree=local_tree)
 
+def extract(obj, path):
+    """
+    Get the item from obj based on path.
+
+    Example:
+
+    >>> from deepdiff import extract
+    >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]}
+    >>> path = "root[1][0]['2']"
+    >>> extract(obj, path)
+    'b'
+
+    Note that you can use extract in conjunction with DeepDiff results
+    or even with the search and :ref:`deepsearch_label` modules.
For example: + + >>> from deepdiff import grep + >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]} + >>> result = obj | grep(5) + >>> result + {'matched_values': ['root[2][1]']} + >>> result['matched_values'][0] + 'root[2][1]' + >>> path = result['matched_values'][0] + >>> extract(obj, path) + 5 + + + .. note:: + Note that even if DeepDiff tried gives you a path to an item in a set, + there is no such thing in Python and hence you will get an error trying + to extract that item from a set. + If you want to be able to get items from sets, use the SetOrdered module + to generate the sets. + In fact Deepdiff uses SetOrdered as a dependency. + + >>> from deepdiff import grep, extract + >>> obj = {"a", "b"} + >>> obj | grep("b") + Set item detected in the path.'set' objects do NOT support indexing. But DeepSearch will still report a path. + {'matched_values': SetOrdered(['root[0]'])} + >>> extract(obj, 'root[0]') + Traceback (most recent call last): + File "", line 1, in + File "deepdiff/deepdiff/path.py", line 126, in extract + return _get_nested_obj(obj, elements) + File "deepdiff/deepdiff/path.py", line 84, in _get_nested_obj + obj = obj[elem] + TypeError: 'set' object is not subscriptable + >>> from orderly_set import SetOrdered + >>> obj = SetOrdered(["a", "b"]) + >>> extract(obj, 'root[0]') + 'a' - def _add_hash(self, hashes, item_hash, item, i): - if item_hash in hashes: - hashes[item_hash].indexes.append(i) - else: - hashes[item_hash] = IndexedHash(indexes=[i], item=item) + """ + elements = _path_to_elements(path, root_element=None) + return _get_nested_obj(obj, elements) - def _create_hashtable(self, level, t): - """Create hashtable of {item_hash: (indexes, item)}""" - obj = getattr(level, t) - local_hashes = dict_() - for (i, item) in enumerate(obj): - try: - parent = "{}[{}]".format(level.path(), i) - # Note: in the DeepDiff we only calculate the hash of items when we have to. - # So self.hashes does not include hashes of all objects in t1 and t2. 
- # It only includes the ones needed when comparing iterables. - # The self.hashes dictionary gets shared between different runs of DeepHash - # So that any object that is already calculated to have a hash is not re-calculated. - deep_hash = DeepHash( - item, - hashes=self.hashes, - parent=parent, - apply_hash=True, - **self.deephash_parameters, - ) - except UnicodeDecodeError as err: - err.reason = f"Can not produce a hash for {level.path()}: {err.reason}" - raise - except NotImplementedError: - raise - # except Exception as e: # pragma: no cover - # logger.error("Can not produce a hash for %s." - # "Not counting this object.\n %s" % - # (level.path(), e)) - else: - try: - item_hash = deep_hash[item] - except KeyError: - pass - else: - if item_hash is unprocessed: # pragma: no cover - self.log_err("Item %s was not processed while hashing " - "thus not counting this object." % - level.path()) - else: - self._add_hash(hashes=local_hashes, item_hash=item_hash, item=item, i=i) - - # Also we hash the iterables themselves too so that we can later create cache keys from those hashes. - DeepHash( - obj, - hashes=self.hashes, - parent=level.path(), - apply_hash=True, - **self.deephash_parameters, - ) - return local_hashes +def parse_path(path, root_element=DEFAULT_FIRST_ELEMENT, include_actions=False): + """ + Parse a path to a format that is machine readable - @staticmethod - @lru_cache(maxsize=2028) - def _get_distance_cache_key(added_hash, removed_hash): - key1, key2 = (added_hash, removed_hash) if added_hash > removed_hash else (removed_hash, added_hash) - if isinstance(key1, int): - # If the hash function produces integers we convert them to hex values. - # This was used when the default hash function was Murmur3 128bit which produces integers. 
- key1 = hex(key1).encode('utf-8') - key2 = hex(key2).encode('utf-8') - elif isinstance(key1, str): - key1 = key1.encode('utf-8') - key2 = key2.encode('utf-8') - return key1 + b'--' + key2 + b'dc' - - def _get_rough_distance_of_hashed_objs( - self, added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type=None): - # We need the rough distance between the 2 objects to see if they qualify to be pairs or not - _distance = cache_key = None - if self._stats[DISTANCE_CACHE_ENABLED]: - cache_key = self._get_distance_cache_key(added_hash, removed_hash) - if cache_key in self._distance_cache: - self._stats[DISTANCE_CACHE_HIT_COUNT] += 1 - _distance = self._distance_cache.get(cache_key) - if _distance is None: - # We can only cache the rough distance and not the actual diff result for reuse. - # The reason is that we have modified the parameters explicitly so they are different and can't - # be used for diff reporting - diff = DeepDiff( - removed_hash_obj.item, added_hash_obj.item, - _parameters=self._parameters, - _shared_parameters=self._shared_parameters, - view=DELTA_VIEW, - _original_type=_original_type, - iterable_compare_func=self.iterable_compare_func, - ) - _distance = diff._get_rough_distance() - if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: - self._distance_cache.set(cache_key, value=_distance) - return _distance - - def _get_most_in_common_pairs_in_iterables( - self, hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type): - """ - Get the closest pairs between items that are removed and items that are added. + **Parameters** - returns a dictionary of hashes that are closest to each other. - The dictionary is going to be symmetrical so any key will be a value too and otherwise. + path : A string + The path string such as "root[1][2]['age']" - Note that due to the current reporting structure in DeepDiff, we don't compare an item that - was added to an item that is in both t1 and t2. 
+ root_element: string, default='root' + What the root is called in the path. - For example + include_actions: boolean, default=False + If True, we return the action required to retrieve the item at each element of the path. - [{1, 2}, {4, 5, 6}] - [{1, 2}, {1, 2, 3}] + **Examples** - is only compared between {4, 5, 6} and {1, 2, 3} even though technically {1, 2, 3} is - just one item different than {1, 2} + >>> from deepdiff import parse_path + >>> parse_path("root[1][2]['age']") + [1, 2, 'age'] + >>> parse_path("root[1][2]['age']", include_actions=True) + [{'element': 1, 'action': 'GET'}, {'element': 2, 'action': 'GET'}, {'element': 'age', 'action': 'GET'}] + >>> + >>> parse_path("root['joe'].age") + ['joe', 'age'] + >>> parse_path("root['joe'].age", include_actions=True) + [{'element': 'joe', 'action': 'GET'}, {'element': 'age', 'action': 'GETATTR'}] - Perhaps in future we can have a report key that is item duplicated and modified instead of just added. - """ - cache_key = None - if self._stats[DISTANCE_CACHE_ENABLED]: - cache_key = combine_hashes_lists(items=[hashes_added, hashes_removed], prefix='pairs_cache') - if cache_key in self._distance_cache: - return self._distance_cache.get(cache_key).copy() - - # A dictionary of hashes to distances and each distance to an ordered set of hashes. - # It tells us about the distance of each object from other objects. - # And the objects with the same distances are grouped together in an ordered set. - # It also includes a "max" key that is just the value of the biggest current distance in the - # most_in_common_pairs dictionary. 
- def defaultdict_orderedset(): - return defaultdict(SetOrdered) - most_in_common_pairs = defaultdict(defaultdict_orderedset) - pairs = dict_() - - pre_calced_distances = None - if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1: - # pre-calculates distances ONLY for 1D arrays whether an _original_type - # was explicitly passed or a homogeneous array is detected. - # Numpy is needed for this optimization. - pre_calced_distances = self._precalculate_numpy_arrays_distance( - hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) - - if hashes_added and hashes_removed \ - and self.iterable_compare_func \ - and len(hashes_added) > 0 and len(hashes_removed) > 0: - pre_calced_distances = self._precalculate_distance_by_custom_compare_func( - hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) - - for added_hash in hashes_added: - for removed_hash in hashes_removed: - added_hash_obj = t2_hashtable[added_hash] - removed_hash_obj = t1_hashtable[removed_hash] - - # Loop is detected - if id(removed_hash_obj.item) in parents_ids: - continue - - _distance = None - if pre_calced_distances: - _distance = pre_calced_distances.get("{}--{}".format(added_hash, removed_hash)) - if _distance is None: - _distance = self._get_rough_distance_of_hashed_objs( - added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type) - # Left for future debugging - # print(f'{Fore.RED}distance of {added_hash_obj.item} and {removed_hash_obj.item}: {_distance}{Style.RESET_ALL}') - # Discard potential pairs that are too far. 
- if _distance >= self.cutoff_distance_for_pairs: - continue - pairs_of_item = most_in_common_pairs[added_hash] - pairs_of_item[_distance].add(removed_hash) - used_to_hashes = set() - - distances_to_from_hashes = defaultdict(SetOrdered) - for from_hash, distances_to_to_hashes in most_in_common_pairs.items(): - # del distances_to_to_hashes['max'] - for dist in distances_to_to_hashes: - distances_to_from_hashes[dist].add(from_hash) - - for dist in sorted(distances_to_from_hashes.keys()): - from_hashes = distances_to_from_hashes[dist] - while from_hashes: - from_hash = from_hashes.pop() - if from_hash not in used_to_hashes: - to_hashes = most_in_common_pairs[from_hash][dist] - while to_hashes: - to_hash = to_hashes.pop() - if to_hash not in used_to_hashes: - used_to_hashes.add(from_hash) - used_to_hashes.add(to_hash) - # Left for future debugging: - # print(f'{bcolors.FAIL}Adding {t2_hashtable[from_hash].item} as a pairs of {t1_hashtable[to_hash].item} with distance of {dist}{bcolors.ENDC}') - pairs[from_hash] = to_hash - - inverse_pairs = {v: k for k, v in pairs.items()} - pairs.update(inverse_pairs) - if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: - self._distance_cache.set(cache_key, value=pairs) - return pairs.copy() - - def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None, local_tree=None): - """Diff of hashable or unhashable iterables. Only used when ignoring the order.""" - - full_t1_hashtable = self._create_hashtable(level, 't1') - full_t2_hashtable = self._create_hashtable(level, 't2') - t1_hashes = SetOrdered(full_t1_hashtable.keys()) - t2_hashes = SetOrdered(full_t2_hashtable.keys()) - hashes_added = t2_hashes - t1_hashes - hashes_removed = t1_hashes - t2_hashes - - # Deciding whether to calculate pairs or not. 
- if (len(hashes_added) + len(hashes_removed)) / (len(full_t1_hashtable) + len(full_t2_hashtable) + 1) > self.cutoff_intersection_for_pairs: - get_pairs = False - else: - get_pairs = True + """ - # reduce the size of hashtables - if self.report_repetition: - t1_hashtable = full_t1_hashtable - t2_hashtable = full_t2_hashtable - else: - t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed} - t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added} - if self._stats[PASSES_COUNT] < self.max_passes and get_pairs: - self._stats[PASSES_COUNT] += 1 - pairs = self._get_most_in_common_pairs_in_iterables( - hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type) - elif get_pairs: - if not self._stats[MAX_PASS_LIMIT_REACHED]: - self._stats[MAX_PASS_LIMIT_REACHED] = True - logger.warning(MAX_PASSES_REACHED_MSG.format(self.max_passes)) - pairs = dict_() - else: - pairs = dict_() - - def get_other_pair(hash_value, in_t1=True): - """ - Gets the other paired indexed hash item to the hash_value in the pairs dictionary - in_t1: are we looking for the other pair in t1 or t2? - """ - if in_t1: - hashtable = t1_hashtable - the_other_hashes = hashes_removed - else: - hashtable = t2_hashtable - the_other_hashes = hashes_added - other = pairs.pop(hash_value, notpresent) - if other is notpresent: - other = notpresent_indexed - else: - # The pairs are symmetrical. - # removing the other direction of pair - # so it does not get used. - del pairs[other] - the_other_hashes.remove(other) - other = hashtable[other] - return other - - if self.report_repetition: - for hash_value in hashes_added: - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition (when report_repetition=False). 
- other = get_other_pair(hash_value) - item_id = id(other.item) - indexes = t2_hashtable[hash_value].indexes if other.item is notpresent else other.indexes - # When we report repetitions, we want the child_relationship_param2 only if there is no repetition. - # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example). - # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly. - if len(t2_hashtable[hash_value].indexes) == 1: - index2 = t2_hashtable[hash_value].indexes[0] - else: - index2 = None - for i in indexes: - change_level = level.branch_deeper( - other.item, - t2_hashtable[hash_value].item, - child_relationship_class=SubscriptableIterableRelationship, - child_relationship_param=i, - child_relationship_param2=index2, - ) - if other.item is notpresent: - self._report_result('iterable_item_added', change_level, local_tree=local_tree) - else: - parents_ids_added = add_to_frozen_set(parents_ids, item_id) - self._diff(change_level, parents_ids_added, local_tree=local_tree) - for hash_value in hashes_removed: - if self._count_diff() is StopIteration: - return # pragma: no cover. This is already covered for addition. - other = get_other_pair(hash_value, in_t1=False) - item_id = id(other.item) - # When we report repetitions, we want the child_relationship_param2 only if there is no repetition. - # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example). - # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly. 
- if other.item is notpresent or len(other.indexes > 1): - index2 = None - else: - index2 = other.indexes[0] - for i in t1_hashtable[hash_value].indexes: - change_level = level.branch_deeper( - t1_hashtable[hash_value].item, - other.item, - child_relationship_class=SubscriptableIterableRelationship, - child_relationship_param=i, - child_relationship_param2=index2, - ) - if other.item is notpresent: - self._report_result('iterable_item_removed', change_level, local_tree=local_tree) - else: - # I was not able to make a test case for the following 2 lines since the cases end up - # getting resolved above in the hashes_added calcs. However I am leaving these 2 lines - # in case things change in future. - parents_ids_added = add_to_frozen_set(parents_ids, item_id) # pragma: no cover. - self._diff(change_level, parents_ids_added, local_tree=local_tree) # pragma: no cover. - - items_intersect = t2_hashes.intersection(t1_hashes) - - for hash_value in items_intersect: - t1_indexes = t1_hashtable[hash_value].indexes - t2_indexes = t2_hashtable[hash_value].indexes - t1_indexes_len = len(t1_indexes) - t2_indexes_len = len(t2_indexes) - if t1_indexes_len != t2_indexes_len: # this is a repetition change! - # create "change" entry, keep current level untouched to handle further changes - repetition_change_level = level.branch_deeper( - t1_hashtable[hash_value].item, - t2_hashtable[hash_value].item, # nb: those are equal! 
def stringify_element(param, quote_str=None):
    """Quote *param* so it can be embedded in a stringified path.

    - If *param* contains BOTH quote characters and no explicit
      *quote_str* template is given, every quote character is prefixed
      with the sentinel character ``𝆺𝅥𝅯`` and the whole value is wrapped
      in double quotes (the sentinel lets the parser undo this later).
    - A value containing only single quotes is wrapped in double quotes,
      and vice versa.
    - Otherwise *quote_str* (a ``str.format`` template such as ``"'{}'"``)
      is applied when provided, or the value is returned unchanged.
    """
    has_quote = "'" in param
    has_double_quote = '"' in param
    if has_quote and has_double_quote and not quote_str:
        new_param = []
        for char in param:
            if char in {'"', "'"}:
                # Mark embedded quotes with the sentinel so they survive round-tripping.
                new_param.append('𝆺𝅥𝅯')
            new_param.append(char)
        result = '"' + ''.join(new_param) + '"'
    elif has_quote:
        result = f'"{param}"'
    elif has_double_quote:
        result = f"'{param}'"
    else:
        result = param if quote_str is None else quote_str.format(param)
    return result


def stringify_path(path, root_element=DEFAULT_FIRST_ELEMENT, quote_str="'{}'"):
    """
    Gets the path as a string.

    For example [1, 2, 'age'] should become
    root[1][2]['age']
    """
    if not path:
        return root_element[0]
    result = [root_element[0]]
    has_actions = False
    try:
        # A path of (element, action) pairs already carries its actions.
        if path[0][1] in {GET, GETATTR}:
            has_actions = True
    except (KeyError, IndexError, TypeError):
        pass
    if not has_actions:
        path = [(i, GET) for i in path]
        path[0] = (path[0][0], root_element[1])  # The action for the first element might be a GET or GETATTR. We update the action based on the root_element.
    for element, action in path:
        if isinstance(element, str) and action == GET:
            element = stringify_element(element, quote_str)
        if action == GET:
            result.append(f"[{element}]")
        else:
            result.append(f".{element}")
    return ''.join(result)


# Regex to detect wildcard segments in a raw path string.
# Matches [*], [**], .*, .** that are NOT inside quotes — a quoted key
# such as root['*'] never matches because the quote character sits
# between the bracket and the star.
_WILDCARD_RE = re.compile(
    r'\[\*\*?\]'              # [*] or [**]
    r'|\.\*\*?(?=[.\[]|$)'    # .* or .** followed by . or [ or end of string
)


def path_has_wildcard(path):
    """Check if a path string contains wildcard segments (* or **)."""
    return bool(_WILDCARD_RE.search(path))
class _WildcardToken:
    """Sentinel object for wildcard path tokens.

    Using a dedicated class (instead of plain strings) ensures that a literal
    dict key ``'*'`` (parsed from ``root['*']``) is never confused with the
    wildcard ``*`` (parsed from ``root[*]``).
    """

    def __init__(self, symbol):
        self._symbol = symbol

    def __repr__(self):
        return self._symbol

    def __eq__(self, other):
        # Only equal to another token with the same symbol — never to a plain string.
        return isinstance(other, _WildcardToken) and self._symbol == other._symbol

    def __hash__(self):
        return hash(('_WildcardToken', self._symbol))


SINGLE_WILDCARD = _WildcardToken('*')    # matches exactly one path segment
MULTI_WILDCARD = _WildcardToken('**')    # matches zero or more path segments


class GlobPathMatcher:
    """Pre-compiled matcher for a single glob pattern path.

    Parses a pattern like ``root['users'][*]['password']`` into segments
    and matches concrete path strings against it.

    ``*`` matches exactly one path segment (any key, index, or attribute).
    ``**`` matches zero or more path segments.
    """

    def __init__(self, pattern_path):
        self.original_pattern = pattern_path
        elements = _path_to_elements(pattern_path, root_element=('root', GETATTR))
        # Skip the root element for matching
        self._pattern = elements[1:]

    def match(self, path_string):
        """Return True if *path_string* matches this pattern exactly."""
        elements = _path_to_elements(path_string, root_element=('root', GETATTR))
        target = elements[1:]
        return self._match_segments(self._pattern, target, 0, 0)

    def match_or_is_ancestor(self, path_string):
        """Return True if *path_string* matches OR is an ancestor of a potential match.

        This is needed for ``include_paths``: we must not prune a path that
        could lead to a matching descendant.
        """
        elements = _path_to_elements(path_string, root_element=('root', GETATTR))
        target = elements[1:]
        return (self._match_segments(self._pattern, target, 0, 0) or
                self._could_match_descendant(self._pattern, target, 0, 0))

    def match_or_is_descendant(self, path_string):
        """Return True if *path_string* matches OR is a descendant of a matching path.

        This checks whether the pattern matches any prefix of *path_string*,
        meaning the path is "inside" a matched subtree.
        """
        elements = _path_to_elements(path_string, root_element=('root', GETATTR))
        target = elements[1:]
        # Check exact match first
        if self._match_segments(self._pattern, target, 0, 0):
            return True
        # Check if any proper prefix of target matches (making this path a descendant)
        for length in range(len(target)):
            if self._match_segments(self._pattern, target[:length], 0, 0):
                return True
        return False

    @staticmethod
    def _match_segments(pattern, target, pi, ti):
        """Recursive segment matcher with backtracking for ``**``."""
        while pi < len(pattern) and ti < len(target):
            pat_elem = pattern[pi][0]

            if pat_elem == MULTI_WILDCARD:
                # ** matches zero or more segments — try every suffix
                for k in range(ti, len(target) + 1):
                    if GlobPathMatcher._match_segments(pattern, target, pi + 1, k):
                        return True
                return False
            elif pat_elem == SINGLE_WILDCARD:
                # * matches exactly one segment regardless of value/action
                pi += 1
                ti += 1
            else:
                tgt_elem = target[ti][0]
                if pat_elem != tgt_elem:
                    return False
                pi += 1
                ti += 1

        # Consume any trailing ** (they can match zero segments)
        while pi < len(pattern) and pattern[pi][0] == MULTI_WILDCARD:
            pi += 1

        return pi == len(pattern) and ti == len(target)

    @staticmethod
    def _could_match_descendant(pattern, target, pi, ti):
        """Check if *target* is a prefix that could lead to a match deeper down."""
        if ti == len(target):
            # Target exhausted — it's an ancestor if pattern has remaining segments
            return pi < len(pattern)

        if pi >= len(pattern):
            return False

        pat_elem = pattern[pi][0]

        if pat_elem == MULTI_WILDCARD:
            return (GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti) or
                    GlobPathMatcher._could_match_descendant(pattern, target, pi, ti + 1))
        elif pat_elem == SINGLE_WILDCARD:
            return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1)
        else:
            tgt_elem = target[ti][0]
            if pat_elem != tgt_elem:
                return False
            return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1)


def compile_glob_paths(paths):
    """Compile a list of glob pattern strings into GlobPathMatcher objects.

    Returns a list of ``GlobPathMatcher`` or ``None`` if *paths* is empty/None.
    """
    if not paths:
        return None
    return [GlobPathMatcher(p) for p in paths]


def separate_wildcard_and_exact_paths(paths):
    """Separate a set of paths into exact paths and wildcard pattern paths.

    Returns ``(exact_set_or_none, wildcard_list_or_none)``.
    Wildcard paths must start with ``root``; a ``ValueError`` is raised otherwise.
    """
    if not paths:
        return None, None
    from deepdiff.path import path_has_wildcard, compile_glob_paths
    exact = set()
    wildcards = []
    for path in paths:
        if path_has_wildcard(path):
            if not path.startswith('root'):
                raise ValueError(
                    "Wildcard paths must start with 'root'. Got: {}".format(path))
            wildcards.append(path)
        else:
            exact.add(path)
    exact_result = exact if exact else None
    glob_result = compile_glob_paths(wildcards) if wildcards else None
    return exact_result, glob_result
+ if elem in ('*', '**'): + action = GETATTR if inside == '.' else GET + elements.append((SINGLE_WILDCARD if elem == '*' else MULTI_WILDCARD, action)) + return remove_quotes = False if '𝆺𝅥𝅯' in elem or '\\' in elem: remove_quotes = True @@ -321,3 +356,129 @@ def stringify_path(path, root_element=DEFAULT_FIRST_ELEMENT, quote_str="'{}'"): else: result.append(f".{element}") return ''.join(result) + + +# Regex to detect wildcard segments in a raw path string. +# Matches [*], [**], .*, .** that are NOT inside quotes. +_WILDCARD_RE = re.compile( + r'\[\*\*?\]' # [*] or [**] + r'|\.\*\*?(?=[.\[]|$)' # .* or .** followed by . or [ or end of string +) + + +def path_has_wildcard(path): + """Check if a path string contains wildcard segments (* or **).""" + return bool(_WILDCARD_RE.search(path)) + + +class GlobPathMatcher: + """Pre-compiled matcher for a single glob pattern path. + + Parses a pattern like ``root['users'][*]['password']`` into segments + and matches concrete path strings against it. + + ``*`` matches exactly one path segment (any key, index, or attribute). + ``**`` matches zero or more path segments. + """ + + def __init__(self, pattern_path): + self.original_pattern = pattern_path + elements = _path_to_elements(pattern_path, root_element=('root', GETATTR)) + # Skip the root element for matching + self._pattern = elements[1:] + + def match(self, path_string): + """Return True if *path_string* matches this pattern exactly.""" + elements = _path_to_elements(path_string, root_element=('root', GETATTR)) + target = elements[1:] + return self._match_segments(self._pattern, target, 0, 0) + + def match_or_is_ancestor(self, path_string): + """Return True if *path_string* matches OR is an ancestor of a potential match. + + This is needed for ``include_paths``: we must not prune a path that + could lead to a matching descendant. 
+ """ + elements = _path_to_elements(path_string, root_element=('root', GETATTR)) + target = elements[1:] + return (self._match_segments(self._pattern, target, 0, 0) or + self._could_match_descendant(self._pattern, target, 0, 0)) + + def match_or_is_descendant(self, path_string): + """Return True if *path_string* matches OR is a descendant of a matching path. + + This checks whether the pattern matches any prefix of *path_string*, + meaning the path is "inside" a matched subtree. + """ + elements = _path_to_elements(path_string, root_element=('root', GETATTR)) + target = elements[1:] + # Check exact match first + if self._match_segments(self._pattern, target, 0, 0): + return True + # Check if any prefix of target matches (making this path a descendant) + for length in range(len(target)): + if self._match_segments(self._pattern, target[:length], 0, 0): + return True + return False + + @staticmethod + def _match_segments(pattern, target, pi, ti): + """Recursive segment matcher with backtracking for ``**``.""" + while pi < len(pattern) and ti < len(target): + pat_elem = pattern[pi][0] + + if pat_elem == MULTI_WILDCARD: + # ** matches zero or more segments — try every suffix + for k in range(ti, len(target) + 1): + if GlobPathMatcher._match_segments(pattern, target, pi + 1, k): + return True + return False + elif pat_elem == SINGLE_WILDCARD: + # * matches exactly one segment regardless of value/action + pi += 1 + ti += 1 + else: + tgt_elem = target[ti][0] + if pat_elem != tgt_elem: + return False + pi += 1 + ti += 1 + + # Consume any trailing ** (they can match zero segments) + while pi < len(pattern) and pattern[pi][0] == MULTI_WILDCARD: + pi += 1 + + return pi == len(pattern) and ti == len(target) + + @staticmethod + def _could_match_descendant(pattern, target, pi, ti): + """Check if *target* is a prefix that could lead to a match deeper down.""" + if ti == len(target): + # Target exhausted — it's an ancestor if pattern has remaining segments + return pi < 
len(pattern) + + if pi >= len(pattern): + return False + + pat_elem = pattern[pi][0] + + if pat_elem == MULTI_WILDCARD: + return (GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti) or + GlobPathMatcher._could_match_descendant(pattern, target, pi, ti + 1)) + elif pat_elem == SINGLE_WILDCARD: + return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1) + else: + tgt_elem = target[ti][0] + if pat_elem != tgt_elem: + return False + return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1) + + +def compile_glob_paths(paths): + """Compile a list of glob pattern strings into GlobPathMatcher objects. + + Returns a list of ``GlobPathMatcher`` or ``None`` if *paths* is empty/None. + """ + if not paths: + return None + return [GlobPathMatcher(p) for p in paths] diff --git a/deepdiff/search.py b/deepdiff/search.py index fdb73d79..9b1b11a1 100644 --- a/deepdiff/search.py +++ b/deepdiff/search.py @@ -6,7 +6,8 @@ import logging from deepdiff.helper import ( - strings, numbers, add_to_frozen_set, get_doc, dict_, RE_COMPILED_TYPE, ipranges + strings, numbers, add_to_frozen_set, get_doc, dict_, RE_COMPILED_TYPE, ipranges, + separate_wildcard_and_exact_paths, ) @@ -106,7 +107,8 @@ def __init__(self, self.obj: Any = obj self.case_sensitive: bool = case_sensitive if isinstance(item, strings) else True item = item if self.case_sensitive else (item.lower() if isinstance(item, str) else item) - self.exclude_paths: SetOrdered = SetOrdered(exclude_paths) + _exclude_exact, self.exclude_glob_paths = separate_wildcard_and_exact_paths(set(exclude_paths) if exclude_paths else None) + self.exclude_paths: SetOrdered = SetOrdered(_exclude_exact) if _exclude_exact else SetOrdered() self.exclude_regex_paths: List[Pattern[str]] = [re.compile(exclude_regex_path) for exclude_regex_path in exclude_regex_paths] self.exclude_types: SetOrdered = SetOrdered(exclude_types) self.exclude_types_tuple: tuple[type, ...] 
= tuple( @@ -193,6 +195,8 @@ def __skip_this(self, item: Any, parent: str) -> bool: skip = False if parent in self.exclude_paths: skip = True + elif self.exclude_glob_paths and any(gp.match(parent) for gp in self.exclude_glob_paths): + skip = True elif self.exclude_regex_paths and any( [exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths]): skip = True diff --git a/docs/deephash_doc.rst b/docs/deephash_doc.rst index da271b77..7039281f 100644 --- a/docs/deephash_doc.rst +++ b/docs/deephash_doc.rst @@ -32,10 +32,12 @@ exclude_types: list, default = None exclude_paths: list, default = None List of paths to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one path. + Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. include_paths: list, default = None List of the only paths to include in the report. If only one item, you can pass it as a string. + Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. exclude_regex_paths: list, default = None diff --git a/docs/diff_doc.rst b/docs/diff_doc.rst index e01dab29..1fc18db6 100644 --- a/docs/diff_doc.rst +++ b/docs/diff_doc.rst @@ -55,7 +55,8 @@ encodings: List, default = None exclude_paths: list, default = None :ref:`exclude_paths_label` - List of paths to exclude from the report. If only one item, you can path it as a string. + List of paths to exclude from the report. If only one item, you can pass it as a string. + Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. exclude_regex_paths: list, default = None :ref:`exclude_regex_paths_label` @@ -77,6 +78,7 @@ exclude_obj_callback_strict: function, default = None include_paths: list, default = None :ref:`include_paths_label` List of the only paths to include in the report. If only one item is in the list, you can pass it as a string. 
+ Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. include_obj_callback: function, default = None :ref:`include_obj_callback_label` diff --git a/docs/exclude_paths.rst b/docs/exclude_paths.rst index 2de453ba..0c9b78a6 100644 --- a/docs/exclude_paths.rst +++ b/docs/exclude_paths.rst @@ -59,6 +59,49 @@ Example {'values_changed': {"root['foo']['bar']": {'new_value': 'banana', 'old_value': 'potato'}}} +.. _wildcard_paths_label: + +Wildcard (Glob) Paths +--------------------- + +Both ``exclude_paths`` and ``include_paths`` support wildcard patterns for matching multiple paths at once: + +- ``[*]`` or ``.*`` matches exactly **one** path segment (any key, index, or attribute). +- ``[**]`` or ``.**`` matches **zero or more** path segments at any depth. + +Wildcard patterns must use the full ``root`` prefix (shorthand keys are not supported for wildcards). + +Exclude all ``password`` fields regardless of the parent key: + >>> t1 = {"users": {"alice": {"name": "Alice", "password": "s1"}, "bob": {"name": "Bob", "password": "s2"}}} + >>> t2 = {"users": {"alice": {"name": "Alice", "password": "x1"}, "bob": {"name": "Bob", "password": "x2"}}} + >>> DeepDiff(t1, t2, exclude_paths=["root['users'][*]['password']"]) + {} + +Include only ``name`` fields at any depth: + >>> t1 = {"a": {"name": "A", "secret": 1}, "b": {"name": "B", "secret": 2}} + >>> t2 = {"a": {"name": "X", "secret": 1}, "b": {"name": "Y", "secret": 2}} + >>> result = DeepDiff(t1, t2, include_paths=["root[*]['name']"]) + >>> set(result.get('values_changed', {}).keys()) == {"root['a']['name']", "root['b']['name']"} + True + +Use ``[**]`` to match at any depth: + >>> t1 = {"config": {"db": {"password": "old"}, "cache": {"password": "old"}}} + >>> t2 = {"config": {"db": {"password": "new"}, "cache": {"password": "new"}}} + >>> DeepDiff(t1, t2, exclude_paths=["root[**]['password']"]) + {} + +Literal keys named ``*`` or ``**`` are not treated as wildcards when quoted: 
+ >>> t1 = {"*": 1, "a": 2} + >>> t2 = {"*": 10, "a": 20} + >>> result = DeepDiff(t1, t2, exclude_paths=["root['*']"]) + >>> "root['a']" in result.get('values_changed', {}) + True + +When both ``exclude_paths`` and ``include_paths`` apply to the same path, exclusion takes precedence. + +Wildcards also work with ``DeepHash`` and ``DeepSearch`` exclude_paths. + + .. _exclude_regex_paths_label: Exclude Regex Paths diff --git a/docs/search_doc.rst b/docs/search_doc.rst index 0b268735..7039281f 100644 --- a/docs/search_doc.rst +++ b/docs/search_doc.rst @@ -1,74 +1,388 @@ :orphan: -grep is a more user friendly interface for DeepSearch. It takes exactly the same arguments as DeepSearch except that you pipe the object into it instead of passing it as a parameter. +**DeepHash** -It works just like grep in linux shell! +DeepHash calculates the hash of objects based on their contents in a deterministic way. +This way 2 objects with the same content should have the same hash. + +The main usage of DeepHash is to calculate the hash of otherwise unhashable objects. +For example you can use DeepHash to calculate the hash of a set or a dictionary! + +At the core of it, DeepHash is a deterministic serialization of your object into a string so it +can be passed to a hash function. By default it uses SHA256. You have the option to pass any other hashing function to be used instead. + +**Import** + >>> from deepdiff import DeepHash **Parameters** -item : The item to search for +obj : any object, The object to be hashed based on its content. + + +apply_hash: Boolean, default = True + DeepHash at its core is doing deterministic serialization of objects into strings. + Then it hashes the string. + The only time you want the apply_hash to be False is if you want to know what + the string representation of your object is BEFORE it gets hashed. + + +exclude_types: list, default = None + List of object types to exclude from hashing. 
+ + +exclude_paths: list, default = None + List of paths to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one path. + Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. + + +include_paths: list, default = None + List of the only paths to include in the report. If only one item, you can pass it as a string. + Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. + + +exclude_regex_paths: list, default = None + List of string regex paths or compiled regex paths objects to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one regex path. + + +exclude_obj_callback + function, default = None + A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included. + This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means. + + +encodings: List, default = None + Character encodings to iterate through when we convert bytes into strings. You may want to pass an explicit list of encodings in your objects if you start getting UnicodeDecodeError from DeepHash. Also check out ignore_encoding_errors if you can get away with ignoring these errors and don't want to bother with an explicit list of encodings but it will come at the price of slightly less accuracy of the final results. Example: encodings=["utf-8", "latin-1"] + + +hashes: dictionary, default = empty dictionary + A dictionary of {object or object id: object hash} to start with. + Any object that is encountered and it is already in the hashes dictionary or its id is in the hashes dictionary, + will re-use the hash that is provided by this dictionary instead of re-calculating + its hash. 
This is typically used when you have a series of objects to be hashed and there might be repeats of the same object. + + +hasher: function. default = DeepHash.sha256hex + hasher is the hashing function. The default is DeepHash.sha256hex. + But you can pass another hash function to it if you want. + For example a cryptographic hash function or Python's builtin hash function. + All it needs is a function that takes the input in string format and returns the hash. + + You can use it by passing: hasher=hash for Python's builtin hash. + + The following alternative is already provided: + + - hasher=DeepHash.sha1hex + + Note that prior to DeepDiff 5.2, Murmur3 was the default hash function. + But Murmur3 is removed from DeepDiff dependencies since then. + + +ignore_repetition: Boolean, default = True + If repetitions in an iterable should cause the hash of iterable to be different. + Note that the deepdiff diffing functionality lets this to be the default at all times. + But if you are using DeepHash directly, you can set this parameter. + -verbose_level : int >= 0, default = 1. - Verbose level one shows the paths of found items. - Verbose level 2 shows the path and value of the found items. +ignore_type_in_groups + Ignore type changes between members of groups of types. For example if you want to ignore type changes between float and decimals etc. Note that this is a more granular feature. Most of the times the shortcuts provided to you are enough. + The shortcuts are ignore_string_type_changes which by default is False and ignore_numeric_type_changes which is by default False. You can read more about those shortcuts in this page. ignore_type_in_groups gives you more control compared to the shortcuts. -exclude_paths: list, default = None. - List of paths to exclude from the report. + For example lets say you have specifically str and byte datatypes to be ignored for type changes. Then you have a couple of options: -exclude_types: list, default = None. 
- List of object types to exclude from the report.
+ 1. Set ignore_string_type_changes=True which is the default.
+ 2. Set ignore_type_in_groups=[(str, bytes)]. Here you are saying if we detect one type to be str and the other one bytes, do not report them as type change. It is exactly as passing ignore_type_in_groups=[DeepDiff.strings] or ignore_type_in_groups=DeepDiff.strings .
-case_sensitive: Boolean, default = False
+ Now what if you want also typeA and typeB to be ignored when comparing against each other?
-match_string: Boolean, default = False
- If True, the value of the object or its children have to exactly match the item.
- If False, the value of the item can be a part of the value of the object or its children
+ 1. ignore_type_in_groups=[DeepDiff.strings, (typeA, typeB)]
+ 2. or ignore_type_in_groups=[(str, bytes), (typeA, typeB)]
-use_regexp: Boolean, default = False
+ignore_string_type_changes: Boolean, default = True
+ string type conversions should not affect the hash output when this is set to True.
+ For example "Hello" and b"Hello" should produce the same hash.
-strict_checking: Boolean, default = True
- If True, it will check the type of the object to match, so when searching for '1234',
- it will NOT match the int 1234. Currently this only affects the numeric values searching.
+ By setting it to True, both the string and bytes of hello return the same hash.
+
+
+ignore_numeric_type_changes: Boolean, default = False
+ numeric type conversions should not affect the hash output when this is set to True.
+ For example 10, 10.0 and Decimal(10) should produce the same hash.
+ When ignore_numeric_type_changes is set to True, all numbers are converted
+ to strings with the precision of significant_digits parameter and number_format_notation notation.
+ If no significant_digits is passed by the user, a default value of 12 is used.
+
+
+ignore_type_subclasses
+ Use ignore_type_subclasses=True so when ignoring type (class), the subclasses of that class are ignored too.
+
+
+ignore_string_case
+ Whether to be case-sensitive or not when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
+
+
+ignore_private_variables: Boolean, default = True
+ Whether to exclude the private variables in the calculations or not. It only affects variables that start with double underscores (__).
+
+
+ignore_encoding_errors: Boolean, default = False
+ If you want to get away with UnicodeDecodeError without passing explicit character encodings, set this option to True. If you want to make sure the encoding is done properly, keep this as False and instead pass an explicit list of character encodings to be considered via the encodings parameter.
+
+ignore_iterable_order: Boolean, default = True
+ If order of items in an iterable should not cause the hash of the iterable to be different.
+
+number_format_notation : string, default="f"
+ number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation.
+
+
+significant_digits : int >= 0, default=None
+ By default the significant_digits compares only that many digits AFTER the decimal point. However you can override that by setting the number_format_notation="e" which will make it mean the digits in scientific notation.
+
+ Important: This will affect ANY number comparison when it is set.
+
+ Note: If ignore_numeric_type_changes is set to True and you have left significant_digits to the default of None, it gets automatically set to 12. The reason is that normally when numbers from 2 different types are compared, instead of comparing the values, we only report the type change. 
However when ignore_numeric_type_changes=True, in order to compare numbers from different types to each other, we need to convert them all into strings. The significant_digits will be used to make sure we accurately convert all the numbers into strings in order to report the changes between them.
+
+ Internally it uses "{:.Xf}".format(Your Number) to compare numbers where X=significant_digits when the number_format_notation is left as the default of "f" meaning fixed point.
+
+ Note that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114
+
+ For Decimals, Python's format rounds 2.5 to 2 and 3.5 to 4 (to the closest even number)
+
+ When you set the number_format_notation="e", we use "{:.Xe}".format(Your Number) where X=significant_digits.
+
+truncate_datetime: string, default = None
+ Can take value one of 'second', 'minute', 'hour', 'day' and truncate with this value datetime objects before hashing it
+
+
+
+**Returns**
+ A dictionary of {item: item hash}.
+ If your object is nested, it will build hashes of all the objects it contains too.
+
+
+.. note::
+ DeepHash output is not like conventional hash functions. It is a dictionary of object IDs to their hashes. This happens because DeepHash calculates the hash of the object and any other objects found within the object in a recursive manner. 
If you only need the hash of the object you are passing, all you need to do is to do: + + >>> from deepdiff import DeepHash + >>> obj = {1: 2, 'a': 'b'} + >>> DeepHash(obj)[obj] # doctest: +SKIP **Examples** -Importing - >>> from deepdiff import grep - >>> from pprint import pprint - -Search in list for string - >>> obj = ["long somewhere", "string", 0, "somewhere great!"] - >>> item = "somewhere" - >>> ds = obj | grep(item) - >>> print(ds) - {'matched_values': ['root[0]', 'root[3]']} - -Search in nested data for string - >>> obj = ["something somewhere", {"long": "somewhere", "string": 2, 0: 0, "somewhere": "around"}] - >>> item = "somewhere" - >>> ds = obj | grep(item, verbose_level=2) - >>> pprint(ds, indent=2) - { 'matched_paths': {"root[1]['somewhere']": 'around'}, - 'matched_values': { 'root[0]': 'something somewhere', - "root[1]['long']": 'somewhere'}} - -You can also use regular expressions - >>> obj = ["something here", {"long": "somewhere", "someone": 2, 0: 0, "somewhere": "around"}] - >>> ds = obj | grep("some.*", use_regexp=True) - >>> pprint(ds, indent=2) - { 'matched_paths': ["root[1]['someone']", "root[1]['somewhere']"], - 'matched_values': ['root[0]', "root[1]['long']"]} - - -Change strict_checking to False to match numbers in strings and vice versa: - >>> obj = {"long": "somewhere", "num": 1123456, 0: 0, "somewhere": "around"} - >>> item = "1234" - >>> result = {"matched_values": {"root['num']"}} - >>> ds = obj | grep(item, verbose_level=1, use_regexp=True) - >>> pprint(ds) - {} +Let's say you have a dictionary object. + >>> from deepdiff import DeepHash + >>> obj = {1: 2, 'a': 'b'} + +If you try to hash it: + >>> hash(obj) + Traceback (most recent call last): + File "", line 1, in + TypeError: unhashable type: 'dict' + +But with DeepHash: + + >>> from deepdiff import DeepHash + >>> obj = {1: 2, 'a': 'b'} + >>> DeepHash(obj) # doctest: +SKIP + + So what is exactly the hash of obj in this case? 
+ DeepHash is calculating the hash of the obj and any other object that obj contains. + The output of DeepHash is a dictionary of object IDs to their hashes. + In order to get the hash of obj itself, you need to use the object (or the id of object) to get its hash: + + >>> hashes = DeepHash(obj) + >>> hashes[obj] + 'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b' + + Which you can write as: + + >>> hashes = DeepHash(obj)[obj] + + At first it might seem weird why DeepHash(obj)[obj] but remember that DeepHash(obj) is a dictionary of hashes of all other objects that obj contains too. + + If you prefer to use another hashing algorithm, you can pass it using the hasher parameter. + + If you do a deep copy of the obj, it should still give you the same hash: + + >>> from copy import deepcopy + >>> obj2 = deepcopy(obj) + >>> DeepHash(obj2)[obj2] + 'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b' + + Note that by default DeepHash will include string type differences. So if your strings were bytes: + + >>> obj3 = {1: 2, b'a': b'b'} + >>> DeepHash(obj3)[obj3] + '71db3231177d49f78b52a356ca206e6179417b681604d00ed703a077049e3300' + + But if you want the same hash if string types are different, set ignore_string_type_changes to True: + + >>> DeepHash(obj3, ignore_string_type_changes=True)[obj3] + 'e60c2befb84be625037c75e1e26d0bfc85a0ffc1f3cde9500f68f6eac55e5ad6' + + ignore_numeric_type_changes is by default False too. + + >>> from decimal import Decimal + >>> obj1 = {4:10} + >>> obj2 = {4.0: Decimal(10.0)} + >>> DeepHash(obj1)[4] == DeepHash(obj2)[4.0] + False + + But by setting it to True, we can get the same hash. + + >>> DeepHash(obj1, ignore_numeric_type_changes=True)[4] == DeepHash(obj2, ignore_numeric_type_changes=True)[4.0] + True + +number_format_notation: String, default = "f" + number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. 
"f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation. + + +ignore_string_type_changes: Boolean, default = True + By setting it to True, both the string and bytes of hello return the same hash. + + >>> DeepHash(b'hello', ignore_string_type_changes=True)[b'hello'] + '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' + >>> DeepHash('hello', ignore_string_type_changes=True)['hello'] + '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' + + +ignore_numeric_type_changes: Boolean, default = False + For example if significant_digits=5, 1.1, Decimal(1.1) are both converted to 1.10000 + + That way they both produce the same hash. + + >>> t1 = {1: 1, 2: 2.22} + >>> DeepHash(t1)[1] + 'c1800a30c736483f13615542e7096f7973631fef8ca935ee1ed9f35fb06fd44e' + >>> DeepHash(t1, ignore_numeric_type_changes=True)[1] == DeepHash(t1, ignore_numeric_type_changes=True)[1.0] + True + + You can pass a list of tuples or list of lists if you have various type groups. When t1 and t2 both fall under one of these type groups, the type change will be ignored. DeepDiff already comes with 2 groups: DeepDiff.strings and DeepDiff.numbers . If you want to pass both: + + >>> from deepdiff import DeepDiff + >>> ignore_type_in_groups = [DeepDiff.strings, DeepDiff.numbers] + + +ignore_type_in_groups example with custom objects: + + >>> class Burrito: + ... bread = 'flour' + ... def __init__(self): + ... self.spicy = True + ... + >>> + >>> class Taco: + ... bread = 'flour' + ... def __init__(self): + ... self.spicy = True + ... 
+ >>>
+ >>> burrito = Burrito()
+ >>> taco = Taco()
+ >>>
+ >>> burritos = [burrito]
+ >>> tacos = [taco]
+ >>>
+ >>> d1 = DeepHash(burritos, ignore_type_in_groups=[(Taco, Burrito)])
+ >>> d2 = DeepHash(tacos, ignore_type_in_groups=[(Taco, Burrito)])
+ >>> d1[burrito] == d2[taco]
+ True
+
+
+ignore_type_subclasses
+ Use ignore_type_subclasses=True so when ignoring type (class), the subclasses of that class are ignored too.
+
+ >>> from deepdiff import DeepHash
+ >>>
+ >>> class ClassB:
+ ... def __init__(self, x):
+ ... self.x = x
+ ... def __repr__(self):
+ ... return "obj b"
+ ...
+ >>>
+ >>> class ClassC(ClassB):
+ ... def __repr__(self):
+ ... return "obj c"
+ ...
+ >>> obj_b = ClassB(1)
+ >>> obj_c = ClassC(1)
+ >>>
+ >>> # By default, subclasses are considered part of the type group.
+ ... # ignore_type_in_groups=[(ClassB, )] matches ClassC too since it's a subclass.
+ ... hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )])
+ >>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )])
+ >>> hashes_b[obj_b] == hashes_c[obj_c]
+ True
+ >>>
+ >>> # With ignore_type_subclasses=True, only exact type matches count.
+ ... # ClassC no longer matches (ClassB, ) group, so hashes differ.
+ ... hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True)
+ >>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True)
+ >>> hashes_b[obj_b] != hashes_c[obj_c]
+ True
+
+ignore_string_case
+ Whether to be case-sensitive or not when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
+
+ >>> from deepdiff import DeepHash
+ >>> DeepHash('hello')['hello'] == DeepHash('heLLO')['heLLO']
+ False
+ >>> DeepHash('hello', ignore_string_case=True)['hello'] == DeepHash('heLLO', ignore_string_case=True)['heLLO']
+ True
+
+exclude_obj_callback
+ function, default = None
+ A function that takes the object and its path and returns a Boolean. 
If True is returned, the object is excluded from the results, otherwise it is included. + This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means. + + >>> def exclude_obj_callback(obj, path): + ... return True if isinstance(obj, str) and obj in ('x', 'y') else False + ... + >>> dic1 = {"x": 1, "y": 2, "z": 3} + >>> t1 = [dic1] + >>> t1_hash = DeepHash(t1, exclude_obj_callback=exclude_obj_callback) + >>> + >>> dic2 = {"z": 3} + >>> t2 = [dic2] + >>> t2_hash = DeepHash(t2, exclude_obj_callback=exclude_obj_callback) + >>> + >>> t1_hash[t1] == t2_hash[t2] + True + +number_format_notation : string, default="f" + When numbers are converted to the string, you have the choices between "f" as fixed point and "e" as scientific notation: + + >>> t1=10002 + >>> t2=10004 + >>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="f") + >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="f") + >>> + >>> t1_hash[t1] == t2_hash[t2] + False + >>> + >>> + >>> # Now we use the scientific notation + ... t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e") + >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e") >>> - >>> ds = obj | grep(item, verbose_level=1, use_regexp=True, strict_checking=False) - >>> pprint(ds) - {'matched_values': ["root['num']"]} + >>> t1_hash[t1] == t2_hash[t2] + True + +Defining your own number_to_string_func + Lets say you want the hash of numbers below 100 to be the same for some reason. + + >>> from deepdiff import DeepHash + >>> from deepdiff.helper import number_to_string + >>> def custom_number_to_string(number, *args, **kwargs): + ... number = 100 if number < 100 else number + ... return number_to_string(number, *args, **kwargs) + ... 
+ >>> t1 = [10, 12, 100000] + >>> t2 = [50, 63, 100021] + >>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string) + >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string) + >>> t1_hash[t1] == t2_hash[t2] + True + + So both lists produced the same hash thanks to the low significant digits for 100000 vs 100021 and also the custom_number_to_string that converted all numbers below 100 to be 100! diff --git a/tests/test_glob_paths.py b/tests/test_glob_paths.py new file mode 100644 index 00000000..d97ead2b --- /dev/null +++ b/tests/test_glob_paths.py @@ -0,0 +1,719 @@ +import pytest +from deepdiff import DeepDiff, DeepHash, DeepSearch, grep +from deepdiff.path import ( + GlobPathMatcher, compile_glob_paths, path_has_wildcard, + _path_to_elements, SINGLE_WILDCARD, MULTI_WILDCARD, +) +from deepdiff.helper import separate_wildcard_and_exact_paths + + +# ── path_has_wildcard detection ────────────────────────────────────── + + +class TestPathHasWildcard: + + @pytest.mark.parametrize("path, expected", [ + ("root[*]", True), + ("root[**]", True), + ("root.*", True), + ("root.**", True), + ("root['users'][*]['name']", True), + ("root[**]['password']", True), + ("root['*']", False), # literal key named '*' + ("root['**']", False), # literal key named '**' + ("root['foo']['bar']", False), + ("root[0][1]", False), + ("root.foo.bar", False), + ("root[*][*]", True), # multiple wildcards + ("root[**][**]", True), + ("root.*.bar.*", True), # multiple dot wildcards + ]) + def test_detection(self, path, expected): + assert path_has_wildcard(path) is expected + + +# ── _path_to_elements parsing of wildcards ─────────────────────────── + + +class TestWildcardParsing: + + @pytest.mark.parametrize("path, expected", [ + ("root[*]", (('root', 'GETATTR'), (SINGLE_WILDCARD, 'GET'))), + ("root[**]", (('root', 'GETATTR'), (MULTI_WILDCARD, 'GET'))), + 
("root['users'][*]['password']", ( + ('root', 'GETATTR'), ('users', 'GET'), (SINGLE_WILDCARD, 'GET'), ('password', 'GET'), + )), + ("root[**]['secret']", ( + ('root', 'GETATTR'), (MULTI_WILDCARD, 'GET'), ('secret', 'GET'), + )), + ("root.*.name", ( + ('root', 'GETATTR'), (SINGLE_WILDCARD, 'GETATTR'), ('name', 'GETATTR'), + )), + ("root[*][*]", ( + ('root', 'GETATTR'), (SINGLE_WILDCARD, 'GET'), (SINGLE_WILDCARD, 'GET'), + )), + ]) + def test_parsing(self, path, expected): + assert _path_to_elements(path) == expected + + def test_literal_star_key_not_wildcard(self): + """root['*'] should parse as a literal string '*', not a wildcard token.""" + elems = _path_to_elements("root['*']") + # The element should be a plain string, not a _WildcardToken + assert elems[1][0] == '*' + assert elems[1][0] != SINGLE_WILDCARD + assert isinstance(elems[1][0], str) + + def test_literal_double_star_key_not_wildcard(self): + """root['**'] should parse as a literal string '**', not a wildcard token.""" + elems = _path_to_elements("root['**']") + assert elems[1][0] == '**' + assert elems[1][0] != MULTI_WILDCARD + assert isinstance(elems[1][0], str) + + def test_wildcard_token_repr(self): + """_WildcardToken repr should return the symbol string.""" + assert repr(SINGLE_WILDCARD) == '*' + assert repr(MULTI_WILDCARD) == '**' + + def test_wildcard_token_hash(self): + """_WildcardToken instances should be hashable and usable in sets/dicts.""" + s = {SINGLE_WILDCARD, MULTI_WILDCARD} + assert len(s) == 2 + assert SINGLE_WILDCARD in s + d = {SINGLE_WILDCARD: 'one', MULTI_WILDCARD: 'many'} + assert d[SINGLE_WILDCARD] == 'one' + + +# ── separate_wildcard_and_exact_paths ──────────────────────────────── + + +class TestSeparateWildcardPaths: + + def test_none_input(self): + exact, globs = separate_wildcard_and_exact_paths(None) + assert exact is None + assert globs is None + + def test_empty_input(self): + exact, globs = separate_wildcard_and_exact_paths(set()) + assert exact is None + assert globs 
is None + + def test_all_exact(self): + exact, globs = separate_wildcard_and_exact_paths({"root['foo']", "root['bar']"}) + assert exact == {"root['foo']", "root['bar']"} + assert globs is None + + def test_all_wildcards(self): + exact, globs = separate_wildcard_and_exact_paths({"root[*]", "root[**]['x']"}) + assert exact is None + assert len(globs) == 2 + + def test_mixed(self): + exact, globs = separate_wildcard_and_exact_paths( + {"root['foo']", "root[*]['bar']"} + ) + assert exact == {"root['foo']"} + assert len(globs) == 1 + assert globs[0].original_pattern == "root[*]['bar']" + + def test_wildcard_must_start_with_root(self): + with pytest.raises(ValueError, match="Wildcard paths must start with 'root'"): + separate_wildcard_and_exact_paths({"[*]['foo']"}) + + +# ── GlobPathMatcher.match ──────────────────────────────────────────── + + +class TestGlobPathMatcherMatch: + + # ── single wildcard [*] ── + + @pytest.mark.parametrize("target, expected", [ + ("root['a']", True), + ("root[0]", True), + ("root[99]", True), + ("root", False), # too short + ("root['a']['b']", False), # too long + ]) + def test_single_wildcard_basic(self, target, expected): + m = GlobPathMatcher("root[*]") + assert m.match(target) is expected + + @pytest.mark.parametrize("target, expected", [ + ("root['users']['alice']['password']", True), + ("root['users'][0]['password']", True), + ("root['users'][99]['password']", True), + ("root['users']['password']", False), # missing middle segment + ("root['users']['a']['b']['password']", False), # too many middle segments + ("root['users']['alice']['email']", False), # wrong last segment + ]) + def test_single_wildcard_in_middle(self, target, expected): + m = GlobPathMatcher("root['users'][*]['password']") + assert m.match(target) is expected + + def test_multiple_single_wildcards(self): + """root[*][*] matches exactly two segments after root.""" + m = GlobPathMatcher("root[*][*]") + assert m.match("root['a']['b']") is True + assert 
m.match("root[0][1]") is True + assert m.match("root['a']") is False + assert m.match("root['a']['b']['c']") is False + + # ── double wildcard [**] ── + + @pytest.mark.parametrize("target, expected", [ + ("root", True), # zero segments + ("root['a']", True), # one segment + ("root['a']['b']['c']", True), # many segments + ("root[0][1][2]", True), # numeric indices + ]) + def test_double_wildcard_standalone(self, target, expected): + m = GlobPathMatcher("root[**]") + assert m.match(target) is expected + + @pytest.mark.parametrize("target, expected", [ + ("root['password']", True), # ** matches zero + ("root['a']['password']", True), # ** matches one + ("root['a']['b']['c']['password']", True), # ** matches many + ("root['a']['b']", False), # doesn't end with password + ("root['password']['extra']", False), # extra after password + ]) + def test_double_wildcard_before_key(self, target, expected): + m = GlobPathMatcher("root[**]['password']") + assert m.match(target) is expected + + def test_double_wildcard_both_ends(self): + m = GlobPathMatcher("root[**]['config'][**]['value']") + assert m.match("root['config']['value']") is True + assert m.match("root['a']['config']['value']") is True + assert m.match("root['a']['config']['b']['c']['value']") is True + assert m.match("root['config']['x']") is False + assert m.match("root['value']") is False + + def test_double_wildcard_zero_match_in_middle(self): + """** between two fixed segments can match zero segments.""" + m = GlobPathMatcher("root['a'][**]['b']") + assert m.match("root['a']['b']") is True # ** matches zero + assert m.match("root['a']['x']['b']") is True # ** matches one + assert m.match("root['a']['x']['y']['b']") is True # ** matches two + + def test_adjacent_double_wildcards(self): + m = GlobPathMatcher("root[**][**]['x']") + assert m.match("root['x']") is True + assert m.match("root['a']['x']") is True + assert m.match("root['a']['b']['x']") is True + + # ── dot notation wildcards ── + + def 
test_dot_single_wildcard(self): + m = GlobPathMatcher("root.*.name") + assert m.match("root.user.name") is True + assert m.match("root.name") is False + + def test_dot_double_wildcard(self): + m = GlobPathMatcher("root.**.name") + assert m.match("root.name") is True + assert m.match("root.a.name") is True + assert m.match("root.a.b.name") is True + + # ── mixed bracket and dot ── + + def test_mixed_bracket_and_dot_wildcard(self): + m = GlobPathMatcher("root[*].name") + assert m.match("root['user'].name") is True + assert m.match("root[0].name") is True + + +# ── GlobPathMatcher.match_or_is_ancestor ───────────────────────────── + + +class TestGlobPathMatcherAncestor: + + def test_ancestor_of_double_wildcard(self): + m = GlobPathMatcher("root[**]['password']") + assert m.match_or_is_ancestor("root['users']") is True + assert m.match_or_is_ancestor("root") is True + + def test_match_also_returns_true(self): + m = GlobPathMatcher("root[**]['password']") + assert m.match_or_is_ancestor("root['password']") is True + + def test_any_path_is_ancestor_with_double_wildcard(self): + """With ** in the pattern, any intermediate path could lead to a match.""" + m = GlobPathMatcher("root[**]['password']") + assert m.match_or_is_ancestor("root['x']") is True + assert m.match_or_is_ancestor("root['x']['y']['z']") is True + + def test_single_wildcard_ancestor_positive(self): + m = GlobPathMatcher("root['users'][*]['password']") + assert m.match_or_is_ancestor("root['users']") is True + assert m.match_or_is_ancestor("root") is True + + def test_single_wildcard_ancestor_negative(self): + """A path that diverges from a single-wildcard pattern is not an ancestor.""" + m = GlobPathMatcher("root['users'][*]['password']") + assert m.match_or_is_ancestor("root['other']") is False + + +# ── GlobPathMatcher.match_or_is_descendant ─────────────────────────── + + +class TestGlobPathMatcherDescendant: + + def test_descendant_of_match(self): + m = GlobPathMatcher("root[**]['config']") + assert 
m.match_or_is_descendant("root['config']['value']") is True + assert m.match_or_is_descendant("root['config']['a']['b']") is True + + def test_exact_match(self): + m = GlobPathMatcher("root[**]['config']") + assert m.match_or_is_descendant("root['config']") is True + + def test_not_descendant_or_match(self): + m = GlobPathMatcher("root[**]['secret']") + assert m.match_or_is_descendant("root['config']['db']['host']") is False + + def test_ancestor_is_not_descendant(self): + m = GlobPathMatcher("root['users'][*]['password']") + assert m.match_or_is_descendant("root['users']") is False + + def test_descendant_of_single_wildcard_match(self): + m = GlobPathMatcher("root[*]") + assert m.match_or_is_descendant("root['a']['nested']") is True + + +# ── compile_glob_paths ─────────────────────────────────────────────── + + +class TestCompileGlobPaths: + + def test_none_returns_none(self): + assert compile_glob_paths(None) is None + + def test_empty_returns_none(self): + assert compile_glob_paths([]) is None + + def test_compiles_list(self): + result = compile_glob_paths(["root[*]", "root[**]['x']"]) + assert len(result) == 2 + assert all(isinstance(r, GlobPathMatcher) for r in result) + + +# ── DeepDiff integration: exclude_paths with wildcards ─────────────── + + +class TestDeepDiffExcludeGlob: + + def test_exclude_single_wildcard(self): + t1 = {'users': {'alice': {'name': 'Alice', 'pw': 's1'}, 'bob': {'name': 'Bob', 'pw': 's2'}}} + t2 = {'users': {'alice': {'name': 'Alice', 'pw': 'c1'}, 'bob': {'name': 'Bobby', 'pw': 'c2'}}} + diff = DeepDiff(t1, t2, exclude_paths=["root['users'][*]['pw']"]) + changed = diff.get('values_changed', {}) + assert "root['users']['bob']['name']" in changed + assert "root['users']['alice']['pw']" not in changed + assert "root['users']['bob']['pw']" not in changed + + def test_exclude_double_wildcard(self): + t1 = { + 'config': {'db': {'host': 'localhost', 'secret': 'abc'}, + 'api': {'nested': {'secret': 'xyz'}}}, + 'name': 'app' + } + t2 = { + 
'config': {'db': {'host': 'remotehost', 'secret': 'def'}, + 'api': {'nested': {'secret': 'uvw'}}}, + 'name': 'app2' + } + diff = DeepDiff(t1, t2, exclude_paths=["root[**]['secret']"]) + changed = diff.get('values_changed', {}) + assert "root['config']['db']['host']" in changed + assert "root['name']" in changed + assert "root['config']['db']['secret']" not in changed + assert "root['config']['api']['nested']['secret']" not in changed + + def test_exclude_wildcard_with_list(self): + t1 = [{'name': 'Alice', 'age': 30}, {'name': 'Bob', 'age': 25}] + t2 = [{'name': 'Alice', 'age': 31}, {'name': 'Bobby', 'age': 26}] + diff = DeepDiff(t1, t2, exclude_paths=["root[*]['age']"]) + changed = diff.get('values_changed', {}) + assert "root[1]['name']" in changed + assert "root[0]['age']" not in changed + assert "root[1]['age']" not in changed + + def test_exclude_mix_exact_and_wildcard(self): + t1 = {'a': 1, 'b': 2, 'c': {'d': 3, 'e': 4}} + t2 = {'a': 10, 'b': 20, 'c': {'d': 30, 'e': 40}} + diff = DeepDiff(t1, t2, exclude_paths=["root['a']", "root['c'][*]"]) + changed = diff.get('values_changed', {}) + assert "root['b']" in changed + assert "root['a']" not in changed + assert "root['c']['d']" not in changed + assert "root['c']['e']" not in changed + + def test_exclude_nested_list_of_dicts(self): + t1 = {'data': [{'id': 1, 'meta': {'ts': 100}}, {'id': 2, 'meta': {'ts': 200}}]} + t2 = {'data': [{'id': 1, 'meta': {'ts': 999}}, {'id': 2, 'meta': {'ts': 888}}]} + diff = DeepDiff(t1, t2, exclude_paths=["root['data'][*]['meta']"]) + assert diff == {} + + def test_exclude_with_type_changes(self): + t1 = {'a': {'x': 1, 'y': 'hello'}} + t2 = {'a': {'x': 'changed_type', 'y': 'world'}} + diff = DeepDiff(t1, t2, exclude_paths=["root[*]['x']"]) + changed = diff.get('values_changed', {}) + assert "root['a']['y']" in changed + assert 'type_changes' not in diff + + +# ── DeepDiff integration: include_paths with wildcards ─────────────── + + +class TestDeepDiffIncludeGlob: + + def 
test_include_single_wildcard(self): + t1 = {'users': {'alice': {'name': 'Alice', 'pw': 's1'}, 'bob': {'name': 'Bob', 'pw': 's2'}}} + t2 = {'users': {'alice': {'name': 'Alice2', 'pw': 'c1'}, 'bob': {'name': 'Bobby', 'pw': 'c2'}}} + diff = DeepDiff(t1, t2, include_paths=["root['users'][*]['name']"]) + changed = diff.get('values_changed', {}) + assert "root['users']['alice']['name']" in changed + assert "root['users']['bob']['name']" in changed + assert "root['users']['alice']['pw']" not in changed + assert "root['users']['bob']['pw']" not in changed + + def test_include_double_wildcard(self): + t1 = { + 'config': {'db': {'host': 'localhost', 'secret': 'abc'}, + 'api': {'url': 'http://api', 'nested': {'secret': 'xyz'}}}, + 'name': 'app' + } + t2 = { + 'config': {'db': {'host': 'remotehost', 'secret': 'def'}, + 'api': {'url': 'http://api2', 'nested': {'secret': 'uvw'}}}, + 'name': 'app2' + } + diff = DeepDiff(t1, t2, include_paths=["root[**]['secret']"]) + changed = diff.get('values_changed', {}) + assert "root['config']['db']['secret']" in changed + assert "root['config']['api']['nested']['secret']" in changed + assert "root['config']['db']['host']" not in changed + assert "root['config']['api']['url']" not in changed + assert "root['name']" not in changed + + def test_include_mix_exact_and_wildcard(self): + t1 = { + 'config': {'db': {'host': 'localhost', 'secret': 'abc'}}, + 'name': 'app' + } + t2 = { + 'config': {'db': {'host': 'remotehost', 'secret': 'def'}}, + 'name': 'app2' + } + diff = DeepDiff(t1, t2, include_paths=["root[**]['secret']", "root['name']"]) + changed = diff.get('values_changed', {}) + assert "root['config']['db']['secret']" in changed + assert "root['name']" in changed + assert "root['config']['db']['host']" not in changed + + def test_include_wildcard_no_changes(self): + t1 = {'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}} + t2 = {'a': {'x': 1, 'y': 20}, 'b': {'x': 3, 'y': 40}} + diff = DeepDiff(t1, t2, include_paths=["root[*]['x']"]) + assert 
diff == {} + + def test_include_wildcard_with_added_keys(self): + """When a new key is added, include_paths restricts reporting to matching paths only.""" + t1 = {'a': {'name': 'x'}} + t2 = {'a': {'name': 'y'}, 'b': {'name': 'z'}} + diff = DeepDiff(t1, t2, include_paths=["root[*]['name']"]) + changed = diff.get('values_changed', {}) + assert "root['a']['name']" in changed + # root['b'] addition is not reported because the add is at root['b'], + # not at root[*]['name'] + assert 'dictionary_item_added' not in diff + + def test_include_double_wildcard_with_nested_list(self): + t1 = {'data': [{'scores': [1, 2]}, {'scores': [3, 4]}]} + t2 = {'data': [{'scores': [1, 2]}, {'scores': [3, 5]}]} + diff = DeepDiff(t1, t2, include_paths=["root[**]['scores']"]) + changed = diff.get('values_changed', {}) + assert "root['data'][1]['scores'][1]" in changed + assert len(changed) == 1 + + +# ── Backward compatibility ─────────────────────────────────────────── + + +class TestBackwardCompatibility: + + def test_exact_exclude_paths_unchanged(self): + t1 = {"for life": "vegan", "ingredients": ["no meat", "no eggs"]} + t2 = {"for life": "vegan", "ingredients": ["veggies", "tofu"]} + ddiff = DeepDiff(t1, t2, exclude_paths={"root['ingredients']"}) + assert ddiff == {} + + def test_exact_include_paths_unchanged(self): + t1 = {"for life": "vegan", "ingredients": ["no meat", "no eggs"]} + t2 = {"for life": "vegan2", "ingredients": ["veggies", "tofu"]} + ddiff = DeepDiff(t1, t2, include_paths={"root['for life']"}) + changed = ddiff.get('values_changed', {}) + assert "root['for life']" in changed + assert len(changed) == 1 + + def test_exclude_regex_paths_unchanged(self): + t1 = [{'a': 1, 'b': 2}, {'c': 4, 'b': 5}] + t2 = [{'a': 1, 'b': 3}, {'c': 4, 'b': 5}] + ddiff = DeepDiff(t1, t2, exclude_regex_paths=[r"root\[\d+\]\['b'\]"]) + assert ddiff == {} + + def test_shorthand_paths_unchanged(self): + t1 = {"for life": "vegan", "ingredients": ["no meat"]} + t2 = {"for life": "vegan", 
"ingredients": ["veggies"]} + ddiff = DeepDiff(t1, t2, exclude_paths={"ingredients"}) + assert ddiff == {} + + def test_include_paths_with_nested_prefix(self): + """Existing prefix-based include logic must still work.""" + t1 = {"foo": {"bar": {"veg": "potato", "fruit": "apple"}}} + t2 = {"foo": {"bar": {"veg": "potato", "fruit": "peach"}}} + ddiff = DeepDiff(t1, t2, include_paths="root['foo']['bar']") + changed = ddiff.get('values_changed', {}) + assert "root['foo']['bar']['fruit']" in changed + + +# ── DeepSearch integration ─────────────────────────────────────────── + + +class TestDeepSearchGlob: + + def test_exclude_glob_in_search(self): + obj = {'a': {'secret': 'find_me', 'name': 'x'}, 'b': {'secret': 'find_me', 'name': 'y'}} + result = DeepSearch(obj, 'find_me', exclude_paths=["root[*]['secret']"]) + assert result == {} + + def test_exclude_deep_glob_in_search(self): + obj = {'level1': {'level2': {'target': 'needle', 'other': 'needle'}}} + result = DeepSearch(obj, 'needle', exclude_paths=["root[**]['target']"]) + matched = result.get('matched_values', {}) + assert "root['level1']['level2']['other']" in matched + assert "root['level1']['level2']['target']" not in matched + + def test_exclude_glob_via_grep(self): + obj = [{'secret': 'findme', 'name': 'x'}, {'secret': 'findme', 'name': 'y'}] + result = obj | grep('findme', exclude_paths=["root[*]['secret']"]) + assert result == {} + + def test_exclude_deep_glob_in_list_search(self): + obj = [[1, 2, 'target'], [3, 'target', 4]] + result = DeepSearch(obj, 'target', exclude_paths=["root[*][2]"]) + matched = result.get('matched_values', {}) + assert 'root[1][1]' in matched + assert 'root[0][2]' not in matched + + def test_search_with_mixed_exact_and_glob_exclude(self): + obj = {'a': 'val', 'b': {'c': 'val'}, 'd': {'e': {'f': 'val'}}} + result = DeepSearch(obj, 'val', exclude_paths=["root['a']", "root[**]['f']"]) + matched = result.get('matched_values', {}) + assert "root['b']['c']" in matched + assert "root['a']" 
not in matched + assert "root['d']['e']['f']" not in matched + + +# ── DeepHash integration ───────────────────────────────────────────── + + +class TestDeepHashGlob: + + def test_exclude_exact_makes_hash_equal(self): + t1 = {'name': 'app', 'secret': 'abc'} + t2 = {'name': 'app', 'secret': 'def'} + h1 = DeepHash(t1, exclude_paths=["root['secret']"]) + h2 = DeepHash(t2, exclude_paths=["root['secret']"]) + assert h1[t1] == h2[t2] + + def test_exclude_glob_wildcard_makes_hash_equal(self): + t1 = {'a': {'secret': 'x', 'name': 'n1'}, 'b': {'secret': 'y', 'name': 'n2'}} + t2 = {'a': {'secret': 'X', 'name': 'n1'}, 'b': {'secret': 'Y', 'name': 'n2'}} + h1 = DeepHash(t1, exclude_paths=["root[*]['secret']"]) + h2 = DeepHash(t2, exclude_paths=["root[*]['secret']"]) + assert h1[t1] == h2[t2] + + def test_exclude_deep_glob_makes_hash_equal(self): + t1 = {'a': {'b': {'secret': 1, 'val': 2}}} + t2 = {'a': {'b': {'secret': 99, 'val': 2}}} + h1 = DeepHash(t1, exclude_paths=["root[**]['secret']"]) + h2 = DeepHash(t2, exclude_paths=["root[**]['secret']"]) + assert h1[t1] == h2[t2] + + def test_exclude_glob_hash_not_equal_when_included_part_differs(self): + t1 = {'a': {'secret': 'x', 'name': 'n1'}} + t2 = {'a': {'secret': 'x', 'name': 'DIFFERENT'}} + h1 = DeepHash(t1, exclude_paths=["root[*]['secret']"]) + h2 = DeepHash(t2, exclude_paths=["root[*]['secret']"]) + assert h1[t1] != h2[t2] + + +# ── Edge cases ─────────────────────────────────────────────────────── + + +class TestEdgeCases: + + def test_wildcard_with_ignore_order(self): + t1 = [{'name': 'a', 'pw': '1'}, {'name': 'b', 'pw': '2'}] + t2 = [{'name': 'b', 'pw': '20'}, {'name': 'a', 'pw': '10'}] + diff = DeepDiff(t1, t2, ignore_order=True, exclude_paths=["root[*]['pw']"]) + assert diff == {} + + def test_include_wildcard_with_ignore_order(self): + t1 = [{'name': 'a', 'pw': '1'}, {'name': 'b', 'pw': '2'}] + t2 = [{'name': 'b', 'pw': '20'}, {'name': 'a', 'pw': '10'}] + diff = DeepDiff(t1, t2, ignore_order=True, 
include_paths=["root[*]['name']"]) + assert diff == {} + + def test_wildcard_with_added_removed_keys(self): + t1 = {'users': {'alice': {'name': 'Alice', 'pw': 'a'}}} + t2 = {'users': {'alice': {'name': 'Alice', 'pw': 'b'}, 'bob': {'name': 'Bob', 'pw': 'c'}}} + diff = DeepDiff(t1, t2, exclude_paths=["root['users'][*]['pw']"]) + added = diff.get('dictionary_item_added', []) + assert any("bob" in str(p) for p in added) + + def test_empty_diff_with_wildcard(self): + t1 = {'a': 1} + t2 = {'a': 1} + diff = DeepDiff(t1, t2, exclude_paths=["root[*]"]) + assert diff == {} + + def test_root_double_wildcard_excludes_everything(self): + t1 = {'a': 1, 'b': {'c': 2}} + t2 = {'a': 10, 'b': {'c': 20}} + diff = DeepDiff(t1, t2, exclude_paths=["root[**]"]) + assert diff == {} + + def test_wildcard_with_custom_object(self): + class Obj: + def __init__(self, name, secret): + self.name = name + self.secret = secret + o1 = Obj('a', 's1') + o2 = Obj('b', 's2') + diff = DeepDiff(o1, o2, exclude_paths=["root.secret"]) + changed = diff.get('values_changed', {}) + assert 'root.name' in changed + assert 'root.secret' not in changed + + def test_exclude_wildcard_with_removed_items(self): + t1 = [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}, {'x': 5, 'y': 6}] + t2 = [{'x': 1, 'y': 2}] + diff = DeepDiff(t1, t2, exclude_paths=["root[*]['y']"]) + removed = diff.get('iterable_item_removed', {}) + assert len(removed) == 2 + + def test_wildcard_verbose_level_2(self): + t1 = {'a': {'x': 1}, 'b': {'x': 2}} + t2 = {'a': {'x': 10}, 'b': {'x': 20}} + diff = DeepDiff(t1, t2, exclude_paths=["root[*]['x']"], verbose_level=2) + assert diff == {} + + def test_multiple_wildcards_in_one_pattern(self): + t1 = {'a': {'b': {'c': 1}}, 'x': {'y': {'z': 2}}} + t2 = {'a': {'b': {'c': 10}}, 'x': {'y': {'z': 20}}} + diff = DeepDiff(t1, t2, exclude_paths=["root[*][*][*]"]) + assert diff == {} + + def test_wildcard_does_not_affect_identical_objects(self): + t1 = {'a': [1, 2, 3], 'b': {'c': 'd'}} + diff = DeepDiff(t1, t1, 
exclude_paths=["root[**]"]) + assert diff == {} + + def test_wildcard_as_single_exclude_path_string(self): + """exclude_paths accepts a single string, not just a list.""" + t1 = {'a': {'x': 1}, 'b': {'x': 2}} + t2 = {'a': {'x': 10}, 'b': {'x': 20}} + diff = DeepDiff(t1, t2, exclude_paths="root[*]['x']") + assert diff == {} + + def test_include_wildcard_as_single_string(self): + """include_paths accepts a single string, not just a list.""" + t1 = {'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}} + t2 = {'a': {'x': 10, 'y': 2}, 'b': {'x': 30, 'y': 4}} + diff = DeepDiff(t1, t2, include_paths="root[*]['x']") + changed = diff.get('values_changed', {}) + assert len(changed) == 2 + assert "root['a']['y']" not in changed + + def test_literal_star_key_not_treated_as_wildcard(self): + """A dict key named '*' should be treated literally, not as a wildcard.""" + t1 = {'*': 1, 'a': 2, 'b': 3} + t2 = {'*': 10, 'a': 20, 'b': 30} + # Exclude only the literal '*' key, not all keys + diff = DeepDiff(t1, t2, exclude_paths=["root['*']"]) + changed = diff.get('values_changed', {}) + # '*' key should be excluded, but 'a' and 'b' should still show changes + assert "root['*']" not in changed + assert "root['a']" in changed + assert "root['b']" in changed + + def test_glob_matcher_literal_star_vs_wildcard(self): + """GlobPathMatcher(root['*']) should only match literal '*' key.""" + matcher = GlobPathMatcher("root['*']") + # Should NOT match arbitrary keys (that's what root[*] is for) + assert not matcher.match("root['hello']") + assert not matcher.match("root['a']") + # Should match the literal '*' key + assert matcher.match("root['*']") + + def test_exclude_takes_precedence_over_include(self): + """When a path matches both include and exclude, exclude should win.""" + t1 = {'x': 1, 'y': 2} + t2 = {'x': 10, 'y': 20} + diff = DeepDiff(t1, t2, include_paths=["root['x']"], exclude_paths=["root['x']"]) + assert diff == {} + + def test_exclude_glob_takes_precedence_over_include_glob(self): + 
"""Exclude glob should take precedence over include glob for same path.""" + t1 = {'a': {'x': 1}, 'b': {'x': 2}} + t2 = {'a': {'x': 10}, 'b': {'x': 20}} + diff = DeepDiff(t1, t2, include_paths=["root[*]['x']"], exclude_paths=["root['a'][*]"]) + changed = diff.get('values_changed', {}) + assert "root['a']['x']" not in changed + assert "root['b']['x']" in changed + + def test_include_glob_with_custom_operator(self): + """include_glob_paths should filter custom operator reports to only matching paths.""" + from deepdiff.operator import BaseOperator + + class AlwaysReport(BaseOperator): + """Reports on dict-level comparisons, which are ancestors of the glob target.""" + def give_up_diffing(self, level, diff_instance): + diff_instance.custom_report_result( + 'custom_report', level, {'message': 'custom'}) + return True + + t1 = {'a': {'x': 1}, 'b': {'x': 2}} + t2 = {'a': {'x': 10}, 'b': {'x': 20}} + # Operator fires on dict type — so it reports at root['a'] and root['b'] level + op = AlwaysReport(types=[dict]) + diff = DeepDiff(t1, t2, include_paths=["root[*]['x']"], custom_operators=[op]) + custom = diff.get('custom_report', set()) + # root['a'] and root['b'] are ancestors of the glob pattern, not matches + # or descendants — _skip_report_for_include_glob should filter them out + assert "root['a']" not in custom + assert "root['b']" not in custom + + def test_mixed_exact_include_and_glob_include(self): + """When both exact include_paths and glob include_paths are used together, + exact matches should pass through without glob filtering.""" + t1 = {'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}} + t2 = {'a': {'x': 10, 'y': 20}, 'b': {'x': 30, 'y': 40}} + diff = DeepDiff( + t1, t2, + include_paths=["root['a']", "root[*]['x']"], + ) + changed = diff.get('values_changed', {}) + # root['a']['y'] is covered by exact include root['a'] + assert "root['a']['y']" in changed + # root['b']['x'] is covered by glob root[*]['x'] + assert "root['b']['x']" in changed + # root['b']['y'] is 
NOT covered by either + assert "root['b']['y']" not in changed From f03a0961fba53d65bc43458163989c61e59a83ed Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Sat, 28 Mar 2026 10:29:36 +0000 Subject: [PATCH 2/3] Updating diff.py --- deepdiff/diff.py | 2420 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 2002 insertions(+), 418 deletions(-) diff --git a/deepdiff/diff.py b/deepdiff/diff.py index 2ac62b5e..4a1314ef 100755 --- a/deepdiff/diff.py +++ b/deepdiff/diff.py @@ -1,484 +1,2068 @@ -import re +#!/usr/bin/env python + +# In order to run the docstrings: +# python3 -m deepdiff.diff +# You might need to run it many times since dictionaries come in different orders +# every time you run the docstrings. +# However the docstring expects it in a specific order in order to pass! +import difflib import logging -from ast import literal_eval +import types +import datetime +import uuid +from enum import Enum +from copy import deepcopy +from math import isclose as is_close +from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol, Literal +from collections.abc import Mapping, Iterable, Sequence +from collections import defaultdict +from inspect import getmembers +from itertools import zip_longest from functools import lru_cache +from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent, + IndexedHash, unprocessed, add_to_frozen_set, basic_types, + convert_item_or_items_into_set_else_none, get_type, + convert_item_or_items_into_compiled_regexes_else_none, + type_is_subclass_of_type_group, type_in_type_group, get_doc, + number_to_string, datetime_normalize, KEY_TO_VAL_STR, booleans, + np_ndarray, np_floating, get_numpy_ndarray_rows, RepeatedTimer, + TEXT_VIEW, TREE_VIEW, DELTA_VIEW, COLORED_VIEW, COLORED_COMPACT_VIEW, + detailed__dict__, add_root_to_paths, + np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS, + PydanticBaseModel, Opcode, 
SetOrdered, ipranges, + separate_wildcard_and_exact_paths) +from deepdiff.serialization import SerializationMixin +from deepdiff.distance import DistanceMixin, logarithmic_similarity +from deepdiff.model import ( + RemapDict, ResultDict, TextResult, TreeResult, DiffLevel, + DictRelationship, AttributeRelationship, REPORT_KEYS, + SubscriptableIterableRelationship, NonSubscriptableIterableRelationship, + SetRelationship, NumpyArrayRelationship, CUSTOM_FIELD, + FORCE_DEFAULT, +) +from deepdiff.deephash import DeepHash, combine_hashes_lists +from deepdiff.base import Base +from deepdiff.lfucache import LFUCache, DummyLFU +from deepdiff.colored_view import ColoredView + +if TYPE_CHECKING: + from pytz.tzinfo import BaseTzInfo + logger = logging.getLogger(__name__) -GETATTR = 'GETATTR' -GET = 'GET' +MAX_PASSES_REACHED_MSG = ( + 'DeepDiff has reached the max number of passes of {}. ' + 'You can possibly get more accurate results by increasing the max_passes parameter.') +MAX_DIFFS_REACHED_MSG = ( + 'DeepDiff has reached the max number of diffs of {}. ' + 'You can possibly get more accurate results by increasing the max_diffs parameter.') -class _WildcardToken: - """Sentinel object for wildcard path tokens. - Using a dedicated class (instead of plain strings) ensures that a literal - dict key ``'*'`` (parsed from ``root['*']``) is never confused with the - wildcard ``*`` (parsed from ``root[*]``). - """ - def __init__(self, symbol): - self._symbol = symbol +notpresent_indexed = IndexedHash(indexes=[0], item=notpresent) - def __repr__(self): - return self._symbol +doc = get_doc('diff_doc.rst') - def __eq__(self, other): - return isinstance(other, _WildcardToken) and self._symbol == other._symbol - def __hash__(self): - return hash(('_WildcardToken', self._symbol)) +PROGRESS_MSG = "DeepDiff {} seconds in progress. 
Pass #{}, Diff #{}" -SINGLE_WILDCARD = _WildcardToken('*') -MULTI_WILDCARD = _WildcardToken('**') +def _report_progress(_stats: Dict[str, Any], progress_logger: Callable[[str], None], duration: float) -> None: + """ + Report the progress every few seconds. + """ + progress_logger(PROGRESS_MSG.format(duration, _stats[PASSES_COUNT], _stats[DIFF_COUNT])) + + +DISTANCE_CACHE_HIT_COUNT = 'DISTANCE CACHE HIT COUNT' +DIFF_COUNT = 'DIFF COUNT' +PASSES_COUNT = 'PASSES COUNT' +MAX_PASS_LIMIT_REACHED = 'MAX PASS LIMIT REACHED' +MAX_DIFF_LIMIT_REACHED = 'MAX DIFF LIMIT REACHED' +DISTANCE_CACHE_ENABLED = 'DISTANCE CACHE ENABLED' +PREVIOUS_DIFF_COUNT = 'PREVIOUS DIFF COUNT' +PREVIOUS_DISTANCE_CACHE_HIT_COUNT = 'PREVIOUS DISTANCE CACHE HIT COUNT' +CANT_FIND_NUMPY_MSG = 'Unable to import numpy. This must be a bug in DeepDiff since a numpy array is detected.' +INVALID_VIEW_MSG = "view parameter must be one of 'text', 'tree', 'delta', 'colored' or 'colored_compact'. But {} was passed." +CUTOFF_RANGE_ERROR_MSG = 'cutoff_distance_for_pairs needs to be a positive float max 1.' +VERBOSE_LEVEL_RANGE_MSG = 'verbose_level should be 0, 1, or 2.' +PURGE_LEVEL_RANGE_MSG = 'cache_purge_level should be 0, 1, or 2.' +_ENABLE_CACHE_EVERY_X_DIFF = '_ENABLE_CACHE_EVERY_X_DIFF' + +model_fields_set = frozenset(["model_fields_set"]) + + +# What is the threshold to consider 2 items to be pairs. Only used when ignore_order = True. +CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT = 0.3 + +# What is the threshold to calculate pairs of items between 2 iterables. +# For example 2 iterables that have nothing in common, do not need their pairs to be calculated. 
+CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT = 0.7 + +DEEPHASH_PARAM_KEYS = ( + 'exclude_types', + 'exclude_paths', + 'exclude_glob_paths', + 'include_paths', + 'include_glob_paths', + 'exclude_regex_paths', + 'hasher', + 'significant_digits', + 'number_format_notation', + 'ignore_string_type_changes', + 'ignore_numeric_type_changes', + 'ignore_uuid_types', + 'use_enum_value', + 'ignore_type_in_groups', + 'ignore_type_subclasses', + 'ignore_string_case', + 'exclude_obj_callback', + 'ignore_private_variables', + 'encodings', + 'ignore_encoding_errors', + 'default_timezone', + 'custom_operators', +) -class PathExtractionError(ValueError): - pass +class DeepDiffProtocol(Protocol): + t1: Any + t2: Any + cutoff_distance_for_pairs: float + use_log_scale: bool + log_scale_similarity_threshold: float + view: str + math_epsilon: Optional[float] + + + +class DeepDiff(ResultDict, SerializationMixin, DistanceMixin, DeepDiffProtocol, Base): + __doc__ = doc + + CACHE_AUTO_ADJUST_THRESHOLD = 0.25 + + def __init__(self, + t1: Any, + t2: Any, + _original_type: Optional[Any]=None, + cache_purge_level: int=1, + cache_size: int=0, + cache_tuning_sample_size: int=0, + custom_operators: Optional[List[Any]] =None, + cutoff_distance_for_pairs: float=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT, + cutoff_intersection_for_pairs: float=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT, + default_timezone:Union[datetime.timezone, "BaseTzInfo"]=datetime.timezone.utc, + encodings: Optional[List[str]]=None, + exclude_obj_callback: Optional[Callable]=None, + exclude_obj_callback_strict: Optional[Callable]=None, + exclude_paths: Union[str, List[str], Set[str], FrozenSet[str], None]=None, + exclude_regex_paths: Union[str, List[str], Pattern[str], List[Pattern[str]], None]=None, + exclude_types: Optional[List[type]]=None, + get_deep_distance: bool=False, + group_by: Union[str, Tuple[str, str], Callable, None]=None, + group_by_sort_key: Union[str, Callable, None]=None, + hasher: Optional[Callable]=None, + hashes: 
Optional[Dict[Any, Any]]=None, + ignore_encoding_errors: bool=False, + ignore_nan_inequality: bool=False, + ignore_numeric_type_changes: bool=False, + ignore_order: bool=False, + ignore_order_func: Optional[Callable]=None, + ignore_private_variables: bool=True, + ignore_string_case: bool=False, + ignore_string_type_changes: bool=False, + ignore_type_in_groups: Optional[List[Tuple[Any, ...]]]=None, + ignore_type_subclasses: bool=False, + ignore_uuid_types: bool=False, + include_obj_callback: Optional[Callable]=None, + include_obj_callback_strict: Optional[Callable]=None, + include_paths: Union[str, List[str], None]=None, + iterable_compare_func: Optional[Callable]=None, + log_frequency_in_sec: int=0, + log_scale_similarity_threshold: float=0.1, + log_stacktrace: bool=False, + math_epsilon: Optional[float]=None, + max_diffs: Optional[int]=None, + max_passes: int=10000000, + number_format_notation: Literal["f", "e"]="f", + number_to_string_func: Optional[Callable]=None, + progress_logger: Callable[[str], None]=logger.info, + report_repetition: bool=False, + significant_digits: Optional[int]=None, + threshold_to_diff_deeper: float = 0.33, + truncate_datetime: Optional[str]=None, + use_enum_value: bool=False, + use_log_scale: bool=False, + verbose_level: int=1, + view: str=TEXT_VIEW, + zip_ordered_iterables: bool=False, + _parameters: Optional[Dict[str, Any]]=None, + _shared_parameters: Optional[Dict[str, Any]]=None, + **kwargs): + super().__init__() + # Defaults for glob path attributes — needed for non-root instances + # that may receive _parameters without these keys. 
+ self.exclude_glob_paths = None + self.include_glob_paths = None + if kwargs: + raise ValueError(( + "The following parameter(s) are not valid: %s\n" + "The valid parameters are ignore_order, report_repetition, significant_digits, " + "number_format_notation, exclude_paths, include_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, " + "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, ignore_uuid_types, truncate_datetime, " + "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, " + "view, hasher, hashes, max_passes, max_diffs, zip_ordered_iterables, " + "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, " + "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, log_stacktrace," + "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, default_timezone " + "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold " + "_parameters and _shared_parameters.") % ', '.join(kwargs.keys())) + + if _parameters: + self.__dict__.update(_parameters) + else: + self.custom_operators = custom_operators or [] + self.ignore_order = ignore_order + + self.ignore_order_func = ignore_order_func + + ignore_type_in_groups = ignore_type_in_groups or [] + if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups: + ignore_numeric_type_changes = True + self.ignore_numeric_type_changes = ignore_numeric_type_changes + if strings == ignore_type_in_groups or strings in ignore_type_in_groups: + ignore_string_type_changes = True + # Handle ignore_uuid_types - check if uuid+str group is already in ignore_type_in_groups + uuid_str_group = (uuids[0], str) + if uuid_str_group == ignore_type_in_groups or uuid_str_group in ignore_type_in_groups: + ignore_uuid_types = True + self.ignore_uuid_types = ignore_uuid_types + 
self.use_enum_value = use_enum_value + self.log_scale_similarity_threshold = log_scale_similarity_threshold + self.use_log_scale = use_log_scale + self.default_timezone = default_timezone + self.log_stacktrace = log_stacktrace + self.threshold_to_diff_deeper = threshold_to_diff_deeper + self.ignore_string_type_changes = ignore_string_type_changes + self.ignore_type_in_groups = self.get_ignore_types_in_groups( + ignore_type_in_groups=ignore_type_in_groups, + ignore_string_type_changes=ignore_string_type_changes, + ignore_numeric_type_changes=ignore_numeric_type_changes, + ignore_type_subclasses=ignore_type_subclasses, + ignore_uuid_types=ignore_uuid_types) + self.report_repetition = report_repetition + _exclude_set = convert_item_or_items_into_set_else_none(exclude_paths) + _exclude_exact, self.exclude_glob_paths = separate_wildcard_and_exact_paths(_exclude_set) + self.exclude_paths = add_root_to_paths(_exclude_exact) + _include_set = convert_item_or_items_into_set_else_none(include_paths) + _include_exact, self.include_glob_paths = separate_wildcard_and_exact_paths(_include_set) + self.include_paths = add_root_to_paths(_include_exact) + self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths) + self.exclude_types = set(exclude_types) if exclude_types else None + self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance + self.ignore_type_subclasses = ignore_type_subclasses + self.type_check_func = type_in_type_group if ignore_type_subclasses else type_is_subclass_of_type_group + self.ignore_string_case = ignore_string_case + self.exclude_obj_callback = exclude_obj_callback + self.exclude_obj_callback_strict = exclude_obj_callback_strict + self.include_obj_callback = include_obj_callback + self.include_obj_callback_strict = include_obj_callback_strict + self.number_to_string = number_to_string_func or number_to_string + self.iterable_compare_func = 
iterable_compare_func + self.zip_ordered_iterables = zip_ordered_iterables + self.ignore_private_variables = ignore_private_variables + self.ignore_nan_inequality = ignore_nan_inequality + self.hasher = hasher + self.cache_tuning_sample_size = cache_tuning_sample_size + self.group_by = group_by + if callable(group_by_sort_key): + self.group_by_sort_key = group_by_sort_key + elif group_by_sort_key: + def _group_by_sort_key(x): + return x[group_by_sort_key] + self.group_by_sort_key = _group_by_sort_key + else: + self.group_by_sort_key = None + self.encodings = encodings + self.ignore_encoding_errors = ignore_encoding_errors + + self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes) + self.math_epsilon = math_epsilon + if self.math_epsilon is not None and self.ignore_order: + logger.warning("math_epsilon in conjunction with ignore_order=True is only used for flat object comparisons. Custom math_epsilon will not have an effect when comparing nested objects.") + self.truncate_datetime = get_truncate_datetime(truncate_datetime) + self.number_format_notation = number_format_notation + if verbose_level in {0, 1, 2}: + self.verbose_level = verbose_level + else: + raise ValueError(VERBOSE_LEVEL_RANGE_MSG) + if cache_purge_level not in {0, 1, 2}: + raise ValueError(PURGE_LEVEL_RANGE_MSG) + self.view = view + # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running. + self.max_passes = max_passes + self.max_diffs = max_diffs + self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs) + self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs) + if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1: + raise ValueError(CUTOFF_RANGE_ERROR_MSG) + # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above + # cleaning functionalities when running DeepDiff recursively. 
+ # However DeepHash has its own set of _parameters that are slightly different than DeepDiff. + # DeepDiff _parameters are transformed to DeepHash _parameters via _get_deephash_params method. + self.progress_logger = progress_logger + self.cache_size = cache_size + _parameters = self.__dict__.copy() + _parameters['group_by'] = None # overwriting since these parameters will be passed on to other passes. + if log_stacktrace: + self.log_err = logger.exception + else: + self.log_err = logger.error + + # Non-Root + if _shared_parameters: + self.is_root = False + self._shared_parameters = _shared_parameters + self.__dict__.update(_shared_parameters) + # We are in some pass other than root + progress_timer = None + # Root + else: + self.is_root = True + # Caching the DeepDiff results for dynamic programming + self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU() + self._stats = { + PASSES_COUNT: 0, + DIFF_COUNT: 0, + DISTANCE_CACHE_HIT_COUNT: 0, + PREVIOUS_DIFF_COUNT: 0, + PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0, + MAX_PASS_LIMIT_REACHED: False, + MAX_DIFF_LIMIT_REACHED: False, + DISTANCE_CACHE_ENABLED: bool(cache_size), + } + self.hashes = dict_() if hashes is None else hashes + self._numpy_paths = dict_() # if _numpy_paths is None else _numpy_paths + self.group_by_keys = set() # Track keys that originated from group_by operations + self._shared_parameters = { + 'hashes': self.hashes, + '_stats': self._stats, + '_distance_cache': self._distance_cache, + 'group_by_keys': self.group_by_keys, + '_numpy_paths': self._numpy_paths, + _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10, + } + if log_frequency_in_sec: + # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds. 
+ progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger) + else: + progress_timer = None + self._parameters = _parameters + self.deephash_parameters = self._get_deephash_params() + self.tree = TreeResult() + self._iterable_opcodes = {} + if group_by and self.is_root: + try: + original_t1 = t1 + t1 = self._group_iterable_to_dict(t1, group_by, item_name='t1') + except (KeyError, ValueError): + pass + else: + try: + t2 = self._group_iterable_to_dict(t2, group_by, item_name='t2') + except (KeyError, ValueError): + t1 = original_t1 + + self.t1 = t1 + self.t2 = t2 + + try: + root = DiffLevel(t1, t2, verbose_level=self.verbose_level) + # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays. + # The reason is that we convert the numpy array to python list and then later for distance calculations + # we convert only the last dimension of it into numpy arrays. + self._diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type) + + if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}: + self.tree['deep_distance'] = self._get_rough_distance() + + self.tree.remove_empty_keys() + view_results = self._get_view_results(self.view) + if isinstance(view_results, ColoredView): + self.update(view_results.tree) + self._colored_view = view_results + else: + self.update(view_results) + finally: + if self.is_root: + if cache_purge_level: + del self._distance_cache + del self.hashes + del self._shared_parameters + del self._parameters + for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT, + DISTANCE_CACHE_ENABLED): + del self._stats[key] + if progress_timer: + duration = progress_timer.stop() + self._stats['DURATION SEC'] = duration + logger.info('stats {}'.format(self.get_stats())) + if cache_purge_level == 2: + self.__dict__.clear() + + def _get_deephash_params(self): + result = {key: self._parameters.get(key) for key in DEEPHASH_PARAM_KEYS} + 
result['ignore_repetition'] = not self.report_repetition + result['number_to_string_func'] = self.number_to_string + return result + + def _report_result(self, report_type, change_level, local_tree=None): + """ + Add a detected change to the reference-style result dictionary. + report_type will be added to level. + (We'll create the text-style report from there later.) + :param report_type: A well defined string key describing the type of change. + Examples: "set_item_added", "values_changed" + :param change_level: A DiffLevel object describing the objects in question in their + before-change and after-change object structure. + + :local_tree: None + """ -class RootCanNotBeModified(ValueError): - pass + if not self._skip_this(change_level): + if self._skip_report_for_include_glob(change_level): + return + change_level.report_type = report_type + tree = self.tree if local_tree is None else local_tree + tree[report_type].add(change_level) + def custom_report_result(self, report_type, level, extra_info=None): + """ + Add a detected change to the reference-style result dictionary. + report_type will be added to level. + (We'll create the text-style report from there later.) + + :param report_type: A well defined string key describing the type of change. + Examples: "set_item_added", "values_changed" + :param parent: A DiffLevel object describing the objects in question in their + before-change and after-change object structure. + :param extra_info: A dict that describe this result + :rtype: None + """ -def _add_to_elements(elements, elem, inside): - # Ignore private items - if not elem: - return - if not elem.startswith('__'): - # Handle wildcard tokens (* and **) as-is. - # Unquoted root[*] arrives as bare '*' which matches the string check. 
- # Quoted root['*'] arrives as "'*'" which does NOT match, so it falls - # through to literal_eval and becomes the plain string '*' — which is - # distinct from the _WildcardToken sentinel and thus treated as a - # literal dict key. - if elem in ('*', '**'): - action = GETATTR if inside == '.' else GET - elements.append((SINGLE_WILDCARD if elem == '*' else MULTI_WILDCARD, action)) - return - remove_quotes = False - if '𝆺𝅥𝅯' in elem or '\\' in elem: - remove_quotes = True - else: - try: - elem = literal_eval(elem) - remove_quotes = False - except (ValueError, SyntaxError): - remove_quotes = True - if remove_quotes and elem[0] == elem[-1] and elem[0] in {'"', "'"}: - elem = elem[1: -1] - action = GETATTR if inside == '.' else GET - elements.append((elem, action)) + if not self._skip_this(level): + if self._skip_report_for_include_glob(level): + return + level.report_type = report_type + level.additional[CUSTOM_FIELD] = extra_info + self.tree[report_type].add(level) + + def _skip_report_for_include_glob(self, level): + """When include_glob_paths is set, _skip_this allows ancestors through for traversal. 
+ This method does a stricter check at report time: only report if the path + actually matches a glob pattern or is a descendant of a matching path, + or if it already matches an exact include_path.""" + if not self.include_glob_paths: + return False + level_path = level.path() + # If exact include_paths already matched, don't skip + if self.include_paths: + if level_path in self.include_paths: + return False + for prefix in self.include_paths: + if prefix in level_path: + return False + # Check glob patterns: match or descendant + for gp in self.include_glob_paths: + if gp.match_or_is_descendant(level_path): + return False + return True + @staticmethod + def _dict_from_slots(object: Any) -> Dict[str, Any]: + def unmangle(attribute: str) -> str: + if attribute.startswith('__') and attribute != '__weakref__': + return '_{type}{attribute}'.format( + type=type(object).__name__, + attribute=attribute + ) + return attribute + + all_slots = [] + + if isinstance(object, type): + mro = object.__mro__ # pragma: no cover. I have not been able to write a test for this case. But we still check for it. 
+ else: + mro = object.__class__.__mro__ -DEFAULT_FIRST_ELEMENT = ('root', GETATTR) + for type_in_mro in mro: + slots = getattr(type_in_mro, '__slots__', None) + if slots: + if isinstance(slots, strings): + all_slots.append(slots) + else: + all_slots.extend(slots) + + return {i: getattr(object, key) for i in all_slots if hasattr(object, key := unmangle(i))} + + def _diff_enum(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), local_tree: Optional[Any]=None) -> None: + t1 = detailed__dict__(level.t1, include_keys=ENUM_INCLUDE_KEYS) + t2 = detailed__dict__(level.t2, include_keys=ENUM_INCLUDE_KEYS) + + self._diff_dict( + level, + parents_ids, + print_as_attribute=True, + override=True, + override_t1=t1, + override_t2=t2, + local_tree=local_tree, + ) + + def _diff_obj(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), is_namedtuple: bool=False, local_tree: Optional[Any]=None, is_pydantic_object: bool=False) -> None: + """Difference of 2 objects""" + processing_error = False + t1: Optional[Dict[str, Any]] = None + t2: Optional[Dict[str, Any]] = None + try: + if is_namedtuple: + t1 = level.t1._asdict() + t2 = level.t2._asdict() + elif is_pydantic_object: + t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set) + t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set) + elif all('__dict__' in dir(t) for t in level): + t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables) + t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables) + elif all('__slots__' in dir(t) for t in level): + t1 = self._dict_from_slots(level.t1) + t2 = self._dict_from_slots(level.t2) + else: + t1 = {k: v for k, v in getmembers(level.t1) if not callable(v)} + t2 = {k: v for k, v in getmembers(level.t2) if not callable(v)} + except AttributeError: + processing_error = True + if processing_error is 
True or t1 is None or t2 is None: + self._report_result('unprocessed', level, local_tree=local_tree) + return + self._diff_dict( + level, + parents_ids, + print_as_attribute=True, + override=True, + override_t1=t1, + override_t2=t2, + local_tree=local_tree, + ) + + def _skip_this(self, level: Any) -> bool: + """ + Check whether this comparison should be skipped because one of the objects to compare meets exclusion criteria. + :rtype: bool + """ + level_path = level.path() + skip = False + if self.exclude_paths and level_path in self.exclude_paths: + skip = True + elif self.exclude_glob_paths and any(gp.match(level_path) for gp in self.exclude_glob_paths): + skip = True + if not skip and (self.include_paths or self.include_glob_paths) and level_path != 'root': + skip = True + if self.include_paths: + if level_path in self.include_paths: + skip = False + else: + for prefix in self.include_paths: + if prefix in level_path or level_path in prefix: + skip = False + break + if skip and self.include_glob_paths: + for gp in self.include_glob_paths: + if gp.match_or_is_ancestor(level_path): + skip = False + break + elif self.exclude_regex_paths and any( + [exclude_regex_path.search(level_path) for exclude_regex_path in self.exclude_regex_paths]): + skip = True + elif self.exclude_types_tuple and \ + (isinstance(level.t1, self.exclude_types_tuple) or isinstance(level.t2, self.exclude_types_tuple)): + skip = True + elif self.exclude_obj_callback and \ + (self.exclude_obj_callback(level.t1, level_path) or self.exclude_obj_callback(level.t2, level_path)): + skip = True + elif self.exclude_obj_callback_strict and \ + (self.exclude_obj_callback_strict(level.t1, level_path) and + self.exclude_obj_callback_strict(level.t2, level_path)): + skip = True + elif self.include_obj_callback and level_path != 'root': + skip = True + if (self.include_obj_callback(level.t1, level_path) or self.include_obj_callback(level.t2, level_path)): + skip = False + elif self.include_obj_callback_strict 
and level_path != 'root': + skip = True + if (self.include_obj_callback_strict(level.t1, level_path) and + self.include_obj_callback_strict(level.t2, level_path)): + skip = False + + return skip + + def _skip_this_key(self, level: Any, key: Any) -> bool: + # if include_paths is not set, then treat every path as included + if self.include_paths is None and self.include_glob_paths is None: + return False + key_path = "{}['{}']".format(level.path(), key) + if self.include_paths: + if key_path in self.include_paths: + return False + if level.path() in self.include_paths: + # matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']"] + return False + for prefix in self.include_paths: + if key_path in prefix: + # matches as long as the prefix is longer than this object key + # eg.: level+key root['foo']['bar'] matches prefix root['foo']['bar'] from include paths + # level+key root['foo'] matches prefix root['foo']['bar'] from include_paths + # level+key root['foo']['bar'] DOES NOT match root['foo'] from include_paths This needs to be handled afterwards + return False + # check if a higher level is included as a whole (=without any sublevels specified) + # matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']"] + # but does not match, if it is level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']['fruits']"] + up = level.up + while up is not None: + if up.path() in self.include_paths: + return False + up = up.up + if self.include_glob_paths: + for gp in self.include_glob_paths: + if gp.match_or_is_ancestor(key_path): + return False + return True -@lru_cache(maxsize=1024 * 128) -def _path_to_elements(path, root_element=DEFAULT_FIRST_ELEMENT): - """ - Given a path, it extracts the elements that form the path and their relevant most likely retrieval action. + def _get_clean_to_keys_mapping(self, keys: Any, level: Any) -> Dict[Any, Any]: + """ + Get a dictionary of cleaned value of keys to the keys themselves. 
+ This is mainly used to transform the keys when the type changes of keys should be ignored. - >>> from deepdiff import _path_to_elements - >>> path = "root[4.3].b['a3']" - >>> _path_to_elements(path, root_element=None) - [(4.3, 'GET'), ('b', 'GETATTR'), ('a3', 'GET')] - """ - if isinstance(path, (tuple, list)): - return path - elements = [] - if root_element: - elements.append(root_element) - elem = '' - inside = False - prev_char = None - path = path[4:] # removing "root from the beginning" - brackets = [] - inside_quotes = False - quote_used = '' - for char in path: - if prev_char == '𝆺𝅥𝅯': - elem += char - elif char in {'"', "'"}: - elem += char - # If we are inside and the quote is not what we expected, the quote is not closing - if not(inside_quotes and quote_used != char): - inside_quotes = not inside_quotes - if inside_quotes: - quote_used = char + TODO: needs also some key conversion for groups of types other than the built-in strings and numbers. + """ + result = dict_() + for key in keys: + if self.ignore_string_type_changes and isinstance(key, bytes): + clean_key = key.decode('utf-8') + elif self.ignore_string_type_changes and isinstance(key, memoryview): + clean_key = key.tobytes().decode('utf-8') + elif self.use_enum_value and isinstance(key, Enum): + clean_key = key.value + elif isinstance(key, numbers): + # Skip type prefixing for keys that originated from group_by operations + if hasattr(self, 'group_by_keys') and key in self.group_by_keys: + if self.significant_digits is None: + clean_key = key + else: + clean_key = self.number_to_string(key, significant_digits=self.significant_digits, + number_format_notation=self.number_format_notation) # type: ignore # type: ignore else: - _add_to_elements(elements, elem, inside) - elem = '' - quote_used = '' - elif inside_quotes: - elem += char - elif char == '[': - if inside == '.': - _add_to_elements(elements, elem, inside) - inside = '[' - elem = '' - # we are already inside. 
The bracket is a part of the word. - elif inside == '[': - elem += char + type_ = "number" if self.ignore_numeric_type_changes else key.__class__.__name__ + if self.significant_digits is None: + clean_key = key + else: + clean_key = self.number_to_string(key, significant_digits=self.significant_digits, + number_format_notation=self.number_format_notation) # type: ignore # type: ignore + clean_key = KEY_TO_VAL_STR.format(type_, clean_key) else: - inside = '[' - brackets.append('[') - elem = '' - elif char == '.': - if inside == '[': - elem += char - elif inside == '.': - _add_to_elements(elements, elem, inside) - elem = '' + clean_key = key + if self.ignore_string_case and isinstance(clean_key, str): + clean_key = clean_key.lower() + if clean_key in result: + logger.warning(('{} and {} in {} become the same key when ignore_numeric_type_changes' + 'or ignore_numeric_type_changes are set to be true.').format( + key, result[clean_key], level.path())) else: - inside = '.' - elem = '' - elif char == ']': - if brackets and brackets[-1] == '[': - brackets.pop() - if brackets: - elem += char + result[clean_key] = key + return result + + def _diff_dict( + self, + level: Any, + parents_ids: FrozenSet[int]=frozenset([]), + print_as_attribute: bool=False, + override: bool=False, + override_t1: Optional[Any]=None, + override_t2: Optional[Any]=None, + local_tree: Optional[Any]=None, + ) -> None: + """Difference of 2 dictionaries""" + if override: + # for special stuff like custom objects and named tuples we receive preprocessed t1 and t2 + # but must not spoil the chain (=level) with it + t1 = override_t1 + t2 = override_t2 + else: + t1 = level.t1 + t2 = level.t2 + + if print_as_attribute: + item_added_key = "attribute_added" + item_removed_key = "attribute_removed" + rel_class = AttributeRelationship + else: + item_added_key = "dictionary_item_added" + item_removed_key = "dictionary_item_removed" + rel_class = DictRelationship + + if self.ignore_private_variables: + t1_keys = 
SetOrdered([key for key in t1 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, key)]) + t2_keys = SetOrdered([key for key in t2 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, key)]) + else: + t1_keys = SetOrdered([key for key in t1 if not self._skip_this_key(level, key)]) + t2_keys = SetOrdered([key for key in t2 if not self._skip_this_key(level, key)]) + if self.ignore_string_type_changes or self.ignore_numeric_type_changes or self.ignore_string_case: + t1_clean_to_keys = self._get_clean_to_keys_mapping(keys=t1_keys, level=level) + t2_clean_to_keys = self._get_clean_to_keys_mapping(keys=t2_keys, level=level) + t1_keys = SetOrdered(t1_clean_to_keys.keys()) + t2_keys = SetOrdered(t2_clean_to_keys.keys()) + else: + t1_clean_to_keys = t2_clean_to_keys = None + + t_keys_intersect = t2_keys & t1_keys + t_keys_added = t2_keys - t_keys_intersect + t_keys_removed = t1_keys - t_keys_intersect + + if self.threshold_to_diff_deeper: + if self.exclude_paths or self.exclude_glob_paths: + t_keys_union = {f"{level.path()}[{repr(key)}]" for key in (t2_keys | t1_keys)} + if self.exclude_paths: + t_keys_union -= self.exclude_paths + if self.exclude_glob_paths: + t_keys_union = {k for k in t_keys_union + if not any(gp.match(k) for gp in self.exclude_glob_paths)} + t_keys_union_len = len(t_keys_union) else: - _add_to_elements(elements, elem, inside) - elem = '' - inside = False + t_keys_union_len = len(t2_keys | t1_keys) + if t_keys_union_len > 1 and len(t_keys_intersect) / t_keys_union_len < self.threshold_to_diff_deeper: + self._report_result('values_changed', level, local_tree=local_tree) + return + + for key in t_keys_added: + if self._count_diff() is StopIteration: + return + + key = t2_clean_to_keys[key] if t2_clean_to_keys else key + change_level = level.branch_deeper( + notpresent, + t2[key], + child_relationship_class=rel_class, + child_relationship_param=key, + child_relationship_param2=key, + ) 
+ self._report_result(item_added_key, change_level, local_tree=local_tree) + + for key in t_keys_removed: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + + key = t1_clean_to_keys[key] if t1_clean_to_keys else key + change_level = level.branch_deeper( + t1[key], + notpresent, + child_relationship_class=rel_class, + child_relationship_param=key, + child_relationship_param2=key, + ) + self._report_result(item_removed_key, change_level, local_tree=local_tree) + + for key in t_keys_intersect: # key present in both dicts - need to compare values + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + + key1 = t1_clean_to_keys[key] if t1_clean_to_keys else key + key2 = t2_clean_to_keys[key] if t2_clean_to_keys else key + item_id = id(t1[key1]) + if parents_ids and item_id in parents_ids: + continue + parents_ids_added = add_to_frozen_set(parents_ids, item_id) + + # Go one level deeper + next_level = level.branch_deeper( + t1[key1], + t2[key2], + child_relationship_class=rel_class, + child_relationship_param=key, + child_relationship_param2=key, + ) + self._diff(next_level, parents_ids_added, local_tree=local_tree) + + def _diff_set(self, level: Any, local_tree: Optional[Any]=None) -> None: + """Difference of sets""" + t1_hashtable = self._create_hashtable(level, 't1') + t2_hashtable = self._create_hashtable(level, 't2') + + t1_hashes = set(t1_hashtable.keys()) + t2_hashes = set(t2_hashtable.keys()) + + hashes_added = t2_hashes - t1_hashes + hashes_removed = t1_hashes - t2_hashes + + items_added = [t2_hashtable[i].item for i in hashes_added] + items_removed = [t1_hashtable[i].item for i in hashes_removed] + + for item in items_added: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. 
+ + change_level = level.branch_deeper( + notpresent, item, child_relationship_class=SetRelationship) + self._report_result('set_item_added', change_level, local_tree=local_tree) + + for item in items_removed: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + + change_level = level.branch_deeper( + item, notpresent, child_relationship_class=SetRelationship) + self._report_result('set_item_removed', change_level, local_tree=local_tree) + + @staticmethod + def _iterables_subscriptable(t1: Any, t2: Any) -> bool: + try: + if getattr(t1, '__getitem__') and getattr(t2, '__getitem__'): + return True + else: # pragma: no cover + return False # should never happen + except AttributeError: + return False + + def _diff_iterable(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), _original_type: Optional[type]=None, local_tree: Optional[Any]=None) -> None: + """Difference of iterables""" + if (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order: + self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree) else: - elem += char - prev_char = char - if elem: - _add_to_elements(elements, elem, inside) - return tuple(elements) - - -def _get_nested_obj(obj, elements, next_element=None): - for (elem, action) in elements: - check_elem(elem) - if action == GET: - obj = obj[elem] - elif action == GETATTR: - obj = getattr(obj, elem) - return obj - - -def _guess_type(elements, elem, index, next_element): - # If we are not at the last elements - if index < len(elements) - 1: - # We assume it is a nested dictionary not a nested list - return {} - if isinstance(next_element, int): - return [] - return {} - - -def check_elem(elem): - if isinstance(elem, str) and elem.startswith("__") and elem.endswith("__"): - raise ValueError("traversing dunder attributes is not allowed") - - -def _get_nested_obj_and_force(obj, elements, next_element=None): - 
prev_elem = None - prev_action = None - prev_obj = obj - for index, (elem, action) in enumerate(elements): - check_elem(elem) - _prev_obj = obj - if action == GET: - try: - obj = obj[elem] - prev_obj = _prev_obj - except KeyError: - obj[elem] = _guess_type(elements, elem, index, next_element) - obj = obj[elem] - prev_obj = _prev_obj - except IndexError: - if isinstance(obj, list) and isinstance(elem, int) and elem >= len(obj): - obj.extend([None] * (elem - len(obj))) - obj.append(_guess_type(elements, elem, index), next_element) - obj = obj[-1] - prev_obj = _prev_obj - elif isinstance(obj, list) and len(obj) == 0 and prev_elem: - # We ran into an empty list that should have been a dictionary - # We need to change it from an empty list to a dictionary - obj = {elem: _guess_type(elements, elem, index, next_element)} - if prev_action == GET: - prev_obj[prev_elem] = obj + self._diff_iterable_in_order(level, parents_ids, _original_type=_original_type, local_tree=local_tree) + + def _compare_in_order( + self, level, + t1_from_index=None, t1_to_index=None, + t2_from_index=None, t2_to_index=None + ) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]: + """ + Default compare if `iterable_compare_func` is not provided. + This will compare in sequence order. + """ + if t1_from_index is None: + return [((i, i), (x, y)) for i, (x, y) in enumerate( + zip_longest( + level.t1, level.t2, fillvalue=ListItemRemovedOrAdded))] + else: + t1_chunk = level.t1[t1_from_index:t1_to_index] + t2_chunk = level.t2[t2_from_index:t2_to_index] + return [((i + t1_from_index, i + t2_from_index), (x, y)) for i, (x, y) in enumerate( + zip_longest( + t1_chunk, t2_chunk, fillvalue=ListItemRemovedOrAdded))] + + def _get_matching_pairs( + self, level, + t1_from_index=None, t1_to_index=None, + t2_from_index=None, t2_to_index=None + ) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]: + """ + Given a level get matching pairs. 
This returns list of two tuples in the form: + [ + (t1 index, t2 index), (t1 item, t2 item) + ] + + This will compare using the passed in `iterable_compare_func` if available. + Default it to compare in order + """ + + if self.iterable_compare_func is None: + # Match in order if there is no compare function provided + return self._compare_in_order( + level, + t1_from_index=t1_from_index, t1_to_index=t1_to_index, + t2_from_index=t2_from_index, t2_to_index=t2_to_index, + ) + try: + matches = [] + y_matched = set() + y_index_matched = set() + for i, x in enumerate(level.t1): + x_found = False + for j, y in enumerate(level.t2): + + if(j in y_index_matched): + # This ensures a one-to-one relationship of matches from t1 to t2. + # If y this index in t2 has already been matched to another x + # it cannot have another match, so just continue. + continue + + if(self.iterable_compare_func(x, y, level)): + deep_hash = DeepHash(y, + hashes=self.hashes, + apply_hash=True, + **self.deephash_parameters, + ) + y_index_matched.add(j) + y_matched.add(deep_hash[y]) + matches.append(((i, j), (x, y))) + x_found = True + break + + if(not x_found): + matches.append(((i, -1), (x, ListItemRemovedOrAdded))) + for j, y in enumerate(level.t2): + + deep_hash = DeepHash(y, + hashes=self.hashes, + apply_hash=True, + **self.deephash_parameters, + ) + if(deep_hash[y] not in y_matched): + matches.append(((-1, j), (ListItemRemovedOrAdded, y))) + return matches + except CannotCompare: + return self._compare_in_order( + level, + t1_from_index=t1_from_index, t1_to_index=t1_to_index, + t2_from_index=t2_from_index, t2_to_index=t2_to_index + ) + + def _diff_iterable_in_order(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None): + # We're handling both subscriptable and non-subscriptable iterables. Which one is it? 
+ subscriptable = self._iterables_subscriptable(level.t1, level.t2) + if subscriptable: + child_relationship_class = SubscriptableIterableRelationship + else: + child_relationship_class = NonSubscriptableIterableRelationship + + if ( + not self.zip_ordered_iterables + and isinstance(level.t1, Sequence) + and isinstance(level.t2, Sequence) + and self._all_values_basic_hashable(level.t1) + and self._all_values_basic_hashable(level.t2) + and self.iterable_compare_func is None + ): + local_tree_pass = TreeResult() + opcodes_with_values = self._diff_ordered_iterable_by_difflib( + level, + parents_ids=parents_ids, + _original_type=_original_type, + child_relationship_class=child_relationship_class, + local_tree=local_tree_pass, + ) + # Sometimes DeepDiff's old iterable diff does a better job than DeepDiff + if len(local_tree_pass) > 1: + local_tree_pass2 = TreeResult() + self._diff_by_forming_pairs_and_comparing_one_by_one( + level, + parents_ids=parents_ids, + _original_type=_original_type, + child_relationship_class=child_relationship_class, + local_tree=local_tree_pass2, + ) + if len(local_tree_pass) >= len(local_tree_pass2): + local_tree_pass = local_tree_pass2 + else: + self._iterable_opcodes[level.path(force=FORCE_DEFAULT)] = opcodes_with_values + for report_type, levels in local_tree_pass.items(): + if levels: + self.tree[report_type] |= levels + else: + self._diff_by_forming_pairs_and_comparing_one_by_one( + level, + parents_ids=parents_ids, + _original_type=_original_type, + child_relationship_class=child_relationship_class, + local_tree=local_tree, + ) + + def _all_values_basic_hashable(self, iterable: Iterable[Any]) -> bool: + """ + Are all items basic hashable types? + Or there are custom types too? 
+ """ + + # We don't want to exhaust a generator + if isinstance(iterable, types.GeneratorType): + return False + for item in iterable: + if not isinstance(item, basic_types): + return False + return True + + def _diff_by_forming_pairs_and_comparing_one_by_one( + self, level, local_tree, parents_ids=frozenset(), + _original_type=None, child_relationship_class=None, + t1_from_index=None, t1_to_index=None, + t2_from_index=None, t2_to_index=None, + ): + for (i, j), (x, y) in self._get_matching_pairs( + level, + t1_from_index=t1_from_index, t1_to_index=t1_to_index, + t2_from_index=t2_from_index, t2_to_index=t2_to_index + ): + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + + reference_param1 = i + reference_param2 = j + if y is ListItemRemovedOrAdded: # item removed completely + change_level = level.branch_deeper( + x, + notpresent, + child_relationship_class=child_relationship_class, + child_relationship_param=reference_param1, + child_relationship_param2=reference_param2, + ) + self._report_result('iterable_item_removed', change_level, local_tree=local_tree) + + elif x is ListItemRemovedOrAdded: # new item added + change_level = level.branch_deeper( + notpresent, + y, + child_relationship_class=child_relationship_class, + child_relationship_param=reference_param1, + child_relationship_param2=reference_param2, + ) + self._report_result('iterable_item_added', change_level, local_tree=local_tree) + + else: # check if item value has changed + if (i != j and ((x == y) or self.iterable_compare_func)): + # Item moved + change_level = level.branch_deeper( + x, + y, + child_relationship_class=child_relationship_class, + child_relationship_param=reference_param1, + child_relationship_param2=reference_param2 + ) + self._report_result('iterable_item_moved', change_level, local_tree=local_tree) + + if self.iterable_compare_func: + # Mark additional context denoting that we have moved an item. 
+ # This will allow for correctly setting paths relative to t2 when using an iterable_compare_func + level.additional["moved"] = True + else: - setattr(prev_obj, prev_elem, obj) - obj = obj[elem] - elif action == GETATTR: - obj = getattr(obj, elem) - prev_obj = _prev_obj - prev_elem = elem - prev_action = action - return obj + continue + + item_id = id(x) + if parents_ids and item_id in parents_ids: + continue + parents_ids_added = add_to_frozen_set(parents_ids, item_id) + + # Go one level deeper + next_level = level.branch_deeper( + x, + y, + child_relationship_class=child_relationship_class, + child_relationship_param=reference_param1, + child_relationship_param2=reference_param2 + ) + self._diff(next_level, parents_ids_added, local_tree=local_tree) + + def _diff_ordered_iterable_by_difflib( + self, level, local_tree, parents_ids=frozenset(), _original_type=None, child_relationship_class=None, + ): + + seq = difflib.SequenceMatcher(isjunk=None, a=level.t1, b=level.t2, autojunk=False) + + opcodes = seq.get_opcodes() + opcodes_with_values = [] + + # TODO: this logic should be revisted so we detect reverse operations + # like when a replacement happens at index X and a reverse replacement happens at index Y + # in those cases we have a "iterable_item_moved" operation. 
+ for tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index in opcodes: + if tag == 'equal': + opcodes_with_values.append(Opcode( + tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, + )) + continue + # print('{:7} t1[{}:{}] --> t2[{}:{}] {!r:>8} --> {!r}'.format( + # tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, level.t1[t1_from_index:t1_to_index], level.t2[t2_from_index:t2_to_index])) + + opcodes_with_values.append(Opcode( + tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, + old_values = level.t1[t1_from_index: t1_to_index], + new_values = level.t2[t2_from_index: t2_to_index], + )) + + if tag == 'replace': + self._diff_by_forming_pairs_and_comparing_one_by_one( + level, local_tree=local_tree, parents_ids=parents_ids, + _original_type=_original_type, child_relationship_class=child_relationship_class, + t1_from_index=t1_from_index, t1_to_index=t1_to_index, + t2_from_index=t2_from_index, t2_to_index=t2_to_index, + ) + elif tag == 'delete': + for index, x in enumerate(level.t1[t1_from_index:t1_to_index]): + change_level = level.branch_deeper( + x, + notpresent, + child_relationship_class=child_relationship_class, + child_relationship_param=index + t1_from_index, + child_relationship_param2=index + t1_from_index, + ) + self._report_result('iterable_item_removed', change_level, local_tree=local_tree) + elif tag == 'insert': + for index, y in enumerate(level.t2[t2_from_index:t2_to_index]): + change_level = level.branch_deeper( + notpresent, + y, + child_relationship_class=child_relationship_class, + child_relationship_param=index + t2_from_index, + child_relationship_param2=index + t2_from_index, + ) + self._report_result('iterable_item_added', change_level, local_tree=local_tree) + return opcodes_with_values + + + def _diff_str(self, level, local_tree=None): + """Compare strings""" + if self.ignore_string_case: + level.t1 = level.t1.lower() + level.t2 = level.t2.lower() + + if type(level.t1) == type(level.t2) and level.t1 == 
level.t2: # NOQA + return + # do we add a diff for convenience? + do_diff = True + t1_str = level.t1 + t2_str = level.t2 -def extract(obj, path): - """ - Get the item from obj based on path. - - Example: - - >>> from deepdiff import extract - >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]} - >>> path = "root[1][0]['2']" - >>> extract(obj, path) - 'b' - - Note that you can use extract in conjunction with DeepDiff results - or even with the search and :ref:`deepsearch_label` modules. For example: - - >>> from deepdiff import grep - >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]} - >>> result = obj | grep(5) - >>> result - {'matched_values': ['root[2][1]']} - >>> result['matched_values'][0] - 'root[2][1]' - >>> path = result['matched_values'][0] - >>> extract(obj, path) - 5 - - - .. note:: - Note that even if DeepDiff tried gives you a path to an item in a set, - there is no such thing in Python and hence you will get an error trying - to extract that item from a set. - If you want to be able to get items from sets, use the SetOrdered module - to generate the sets. - In fact Deepdiff uses SetOrdered as a dependency. - - >>> from deepdiff import grep, extract - >>> obj = {"a", "b"} - >>> obj | grep("b") - Set item detected in the path.'set' objects do NOT support indexing. But DeepSearch will still report a path. 
- {'matched_values': SetOrdered(['root[0]'])} - >>> extract(obj, 'root[0]') - Traceback (most recent call last): - File "", line 1, in - File "deepdiff/deepdiff/path.py", line 126, in extract - return _get_nested_obj(obj, elements) - File "deepdiff/deepdiff/path.py", line 84, in _get_nested_obj - obj = obj[elem] - TypeError: 'set' object is not subscriptable - >>> from orderly_set import SetOrdered - >>> obj = SetOrdered(["a", "b"]) - >>> extract(obj, 'root[0]') - 'a' + if isinstance(level.t1, memoryview): + try: + t1_str = level.t1.tobytes().decode('ascii') + except UnicodeDecodeError: + do_diff = False + elif isinstance(level.t1, bytes_type): + try: + t1_str = level.t1.decode('ascii') + except UnicodeDecodeError: + do_diff = False - """ - elements = _path_to_elements(path, root_element=None) - return _get_nested_obj(obj, elements) + if isinstance(level.t2, memoryview): + try: + t2_str = level.t2.tobytes().decode('ascii') + except UnicodeDecodeError: + do_diff = False + elif isinstance(level.t2, bytes_type): + try: + t2_str = level.t2.decode('ascii') + except UnicodeDecodeError: + do_diff = False + if isinstance(level.t1, Enum): + t1_str = level.t1.value -def parse_path(path, root_element=DEFAULT_FIRST_ELEMENT, include_actions=False): - """ - Parse a path to a format that is machine readable + if isinstance(level.t2, Enum): + t2_str = level.t2.value - **Parameters** + if t1_str == t2_str: + return - path : A string - The path string such as "root[1][2]['age']" + if do_diff: + if '\n' in t1_str or isinstance(t2_str, str) and '\n' in t2_str: + diff = difflib.unified_diff( + t1_str.splitlines(), t2_str.splitlines(), lineterm='') + diff = list(diff) + if diff: + level.additional['diff'] = '\n'.join(diff) + + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_tuple(self, level, parents_ids, local_tree=None): + # Checking to see if it has _fields. Which probably means it is a named + # tuple. 
+ try: + level.t1._asdict + # It must be a normal tuple + except AttributeError: + self._diff_iterable(level, parents_ids, local_tree=local_tree) + # We assume it is a namedtuple then + else: + self._diff_obj(level, parents_ids, is_namedtuple=True, local_tree=local_tree) - root_element: string, default='root' - What the root is called in the path. + def _add_hash(self, hashes, item_hash, item, i): + if item_hash in hashes: + hashes[item_hash].indexes.append(i) + else: + hashes[item_hash] = IndexedHash(indexes=[i], item=item) - include_actions: boolean, default=False - If True, we return the action required to retrieve the item at each element of the path. + def _create_hashtable(self, level, t): + """Create hashtable of {item_hash: (indexes, item)}""" + obj = getattr(level, t) - **Examples** + local_hashes = dict_() + for (i, item) in enumerate(obj): + try: + parent = "{}[{}]".format(level.path(), i) + # Note: in the DeepDiff we only calculate the hash of items when we have to. + # So self.hashes does not include hashes of all objects in t1 and t2. + # It only includes the ones needed when comparing iterables. + # The self.hashes dictionary gets shared between different runs of DeepHash + # So that any object that is already calculated to have a hash is not re-calculated. + deep_hash = DeepHash( + item, + hashes=self.hashes, + parent=parent, + apply_hash=True, + **self.deephash_parameters, + ) + except UnicodeDecodeError as err: + err.reason = f"Can not produce a hash for {level.path()}: {err.reason}" + raise + except NotImplementedError: + raise + # except Exception as e: # pragma: no cover + # logger.error("Can not produce a hash for %s." + # "Not counting this object.\n %s" % + # (level.path(), e)) + else: + try: + item_hash = deep_hash[item] + except KeyError: + pass + else: + if item_hash is unprocessed: # pragma: no cover + self.log_err("Item %s was not processed while hashing " + "thus not counting this object." 
% + level.path()) + else: + self._add_hash(hashes=local_hashes, item_hash=item_hash, item=item, i=i) + + # Also we hash the iterables themselves too so that we can later create cache keys from those hashes. + DeepHash( + obj, + hashes=self.hashes, + parent=level.path(), + apply_hash=True, + **self.deephash_parameters, + ) + return local_hashes - >>> from deepdiff import parse_path - >>> parse_path("root[1][2]['age']") - [1, 2, 'age'] - >>> parse_path("root[1][2]['age']", include_actions=True) - [{'element': 1, 'action': 'GET'}, {'element': 2, 'action': 'GET'}, {'element': 'age', 'action': 'GET'}] - >>> - >>> parse_path("root['joe'].age") - ['joe', 'age'] - >>> parse_path("root['joe'].age", include_actions=True) - [{'element': 'joe', 'action': 'GET'}, {'element': 'age', 'action': 'GETATTR'}] + @staticmethod + @lru_cache(maxsize=2028) + def _get_distance_cache_key(added_hash, removed_hash): + key1, key2 = (added_hash, removed_hash) if added_hash > removed_hash else (removed_hash, added_hash) + if isinstance(key1, int): + # If the hash function produces integers we convert them to hex values. + # This was used when the default hash function was Murmur3 128bit which produces integers. 
+ key1 = hex(key1).encode('utf-8') + key2 = hex(key2).encode('utf-8') + elif isinstance(key1, str): + key1 = key1.encode('utf-8') + key2 = key2.encode('utf-8') + return key1 + b'--' + key2 + b'dc' + + def _get_rough_distance_of_hashed_objs( + self, added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type=None): + # We need the rough distance between the 2 objects to see if they qualify to be pairs or not + _distance = cache_key = None + if self._stats[DISTANCE_CACHE_ENABLED]: + cache_key = self._get_distance_cache_key(added_hash, removed_hash) + if cache_key in self._distance_cache: + self._stats[DISTANCE_CACHE_HIT_COUNT] += 1 + _distance = self._distance_cache.get(cache_key) + if _distance is None: + # We can only cache the rough distance and not the actual diff result for reuse. + # The reason is that we have modified the parameters explicitly so they are different and can't + # be used for diff reporting + diff = DeepDiff( + removed_hash_obj.item, added_hash_obj.item, + _parameters=self._parameters, + _shared_parameters=self._shared_parameters, + view=DELTA_VIEW, + _original_type=_original_type, + iterable_compare_func=self.iterable_compare_func, + ) + _distance = diff._get_rough_distance() + if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: + self._distance_cache.set(cache_key, value=_distance) + return _distance + + def _get_most_in_common_pairs_in_iterables( + self, hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type): + """ + Get the closest pairs between items that are removed and items that are added. - """ + returns a dictionary of hashes that are closest to each other. + The dictionary is going to be symmetrical so any key will be a value too and otherwise. 
- result = _path_to_elements(path, root_element=root_element) - result = iter(result) - if root_element: - next(result) # We don't want the root item - if include_actions is False: - return [i[0] for i in result] - return [{'element': i[0], 'action': i[1]} for i in result] - - -def stringify_element(param, quote_str=None): - has_quote = "'" in param - has_double_quote = '"' in param - if has_quote and has_double_quote and not quote_str: - new_param = [] - for char in param: - if char in {'"', "'"}: - new_param.append('𝆺𝅥𝅯') - new_param.append(char) - result = '"' + ''.join(new_param) + '"' - elif has_quote: - result = f'"{param}"' - elif has_double_quote: - result = f"'{param}'" - else: - result = param if quote_str is None else quote_str.format(param) - return result - - -def stringify_path(path, root_element=DEFAULT_FIRST_ELEMENT, quote_str="'{}'"): - """ - Gets the path as an string. + Note that due to the current reporting structure in DeepDiff, we don't compare an item that + was added to an item that is in both t1 and t2. - For example [1, 2, 'age'] should become - root[1][2]['age'] - """ - if not path: - return root_element[0] - result = [root_element[0]] - has_actions = False - try: - if path[0][1] in {GET, GETATTR}: - has_actions = True - except (KeyError, IndexError, TypeError): - pass - if not has_actions: - path = [(i, GET) for i in path] - path[0] = (path[0][0], root_element[1]) # The action for the first element might be a GET or GETATTR. We update the action based on the root_element. 
- for element, action in path: - if isinstance(element, str) and action == GET: - element = stringify_element(element, quote_str) - if action == GET: - result.append(f"[{element}]") + For example + + [{1, 2}, {4, 5, 6}] + [{1, 2}, {1, 2, 3}] + + is only compared between {4, 5, 6} and {1, 2, 3} even though technically {1, 2, 3} is + just one item different than {1, 2} + + Perhaps in future we can have a report key that is item duplicated and modified instead of just added. + """ + cache_key = None + if self._stats[DISTANCE_CACHE_ENABLED]: + cache_key = combine_hashes_lists(items=[hashes_added, hashes_removed], prefix='pairs_cache') + if cache_key in self._distance_cache: + return self._distance_cache.get(cache_key).copy() + + # A dictionary of hashes to distances and each distance to an ordered set of hashes. + # It tells us about the distance of each object from other objects. + # And the objects with the same distances are grouped together in an ordered set. + # It also includes a "max" key that is just the value of the biggest current distance in the + # most_in_common_pairs dictionary. + def defaultdict_orderedset(): + return defaultdict(SetOrdered) + most_in_common_pairs = defaultdict(defaultdict_orderedset) + pairs = dict_() + + pre_calced_distances = None + if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1: + # pre-calculates distances ONLY for 1D arrays whether an _original_type + # was explicitly passed or a homogeneous array is detected. + # Numpy is needed for this optimization. 
+ pre_calced_distances = self._precalculate_numpy_arrays_distance( + hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) + + if hashes_added and hashes_removed \ + and self.iterable_compare_func \ + and len(hashes_added) > 0 and len(hashes_removed) > 0: + pre_calced_distances = self._precalculate_distance_by_custom_compare_func( + hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) + + for added_hash in hashes_added: + for removed_hash in hashes_removed: + added_hash_obj = t2_hashtable[added_hash] + removed_hash_obj = t1_hashtable[removed_hash] + + # Loop is detected + if id(removed_hash_obj.item) in parents_ids: + continue + + _distance = None + if pre_calced_distances: + _distance = pre_calced_distances.get("{}--{}".format(added_hash, removed_hash)) + if _distance is None: + _distance = self._get_rough_distance_of_hashed_objs( + added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type) + # Left for future debugging + # print(f'{Fore.RED}distance of {added_hash_obj.item} and {removed_hash_obj.item}: {_distance}{Style.RESET_ALL}') + # Discard potential pairs that are too far. 
+ if _distance >= self.cutoff_distance_for_pairs: + continue + pairs_of_item = most_in_common_pairs[added_hash] + pairs_of_item[_distance].add(removed_hash) + used_to_hashes = set() + + distances_to_from_hashes = defaultdict(SetOrdered) + for from_hash, distances_to_to_hashes in most_in_common_pairs.items(): + # del distances_to_to_hashes['max'] + for dist in distances_to_to_hashes: + distances_to_from_hashes[dist].add(from_hash) + + for dist in sorted(distances_to_from_hashes.keys()): + from_hashes = distances_to_from_hashes[dist] + while from_hashes: + from_hash = from_hashes.pop() + if from_hash not in used_to_hashes: + to_hashes = most_in_common_pairs[from_hash][dist] + while to_hashes: + to_hash = to_hashes.pop() + if to_hash not in used_to_hashes: + used_to_hashes.add(from_hash) + used_to_hashes.add(to_hash) + # Left for future debugging: + # print(f'{bcolors.FAIL}Adding {t2_hashtable[from_hash].item} as a pairs of {t1_hashtable[to_hash].item} with distance of {dist}{bcolors.ENDC}') + pairs[from_hash] = to_hash + + inverse_pairs = {v: k for k, v in pairs.items()} + pairs.update(inverse_pairs) + if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: + self._distance_cache.set(cache_key, value=pairs) + return pairs.copy() + + def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None, local_tree=None): + """Diff of hashable or unhashable iterables. Only used when ignoring the order.""" + + full_t1_hashtable = self._create_hashtable(level, 't1') + full_t2_hashtable = self._create_hashtable(level, 't2') + t1_hashes = SetOrdered(full_t1_hashtable.keys()) + t2_hashes = SetOrdered(full_t2_hashtable.keys()) + hashes_added = t2_hashes - t1_hashes + hashes_removed = t1_hashes - t2_hashes + + # Deciding whether to calculate pairs or not. 
+ if (len(hashes_added) + len(hashes_removed)) / (len(full_t1_hashtable) + len(full_t2_hashtable) + 1) > self.cutoff_intersection_for_pairs: + get_pairs = False else: - result.append(f".{element}") - return ''.join(result) + get_pairs = True + # reduce the size of hashtables + if self.report_repetition: + t1_hashtable = full_t1_hashtable + t2_hashtable = full_t2_hashtable + else: + t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed} + t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added} + if self._stats[PASSES_COUNT] < self.max_passes and get_pairs: + self._stats[PASSES_COUNT] += 1 + pairs = self._get_most_in_common_pairs_in_iterables( + hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type) + elif get_pairs: + if not self._stats[MAX_PASS_LIMIT_REACHED]: + self._stats[MAX_PASS_LIMIT_REACHED] = True + logger.warning(MAX_PASSES_REACHED_MSG.format(self.max_passes)) + pairs = dict_() + else: + pairs = dict_() + + def get_other_pair(hash_value, in_t1=True): + """ + Gets the other paired indexed hash item to the hash_value in the pairs dictionary + in_t1: are we looking for the other pair in t1 or t2? + """ + if in_t1: + hashtable = t1_hashtable + the_other_hashes = hashes_removed + else: + hashtable = t2_hashtable + the_other_hashes = hashes_added + other = pairs.pop(hash_value, notpresent) + if other is notpresent: + other = notpresent_indexed + else: + # The pairs are symmetrical. + # removing the other direction of pair + # so it does not get used. + del pairs[other] + the_other_hashes.remove(other) + other = hashtable[other] + return other + + if self.report_repetition: + for hash_value in hashes_added: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition (when report_repetition=False). 
+ other = get_other_pair(hash_value) + item_id = id(other.item) + indexes = t2_hashtable[hash_value].indexes if other.item is notpresent else other.indexes + # When we report repetitions, we want the child_relationship_param2 only if there is no repetition. + # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example). + # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly. + if len(t2_hashtable[hash_value].indexes) == 1: + index2 = t2_hashtable[hash_value].indexes[0] + else: + index2 = None + for i in indexes: + change_level = level.branch_deeper( + other.item, + t2_hashtable[hash_value].item, + child_relationship_class=SubscriptableIterableRelationship, + child_relationship_param=i, + child_relationship_param2=index2, + ) + if other.item is notpresent: + self._report_result('iterable_item_added', change_level, local_tree=local_tree) + else: + parents_ids_added = add_to_frozen_set(parents_ids, item_id) + self._diff(change_level, parents_ids_added, local_tree=local_tree) + for hash_value in hashes_removed: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + other = get_other_pair(hash_value, in_t1=False) + item_id = id(other.item) + # When we report repetitions, we want the child_relationship_param2 only if there is no repetition. + # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example). + # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly. 
+                if other.item is notpresent or len(other.indexes) > 1:
+                    index2 = None
+                else:
+                    index2 = other.indexes[0]
+                for i in t1_hashtable[hash_value].indexes:
+                    change_level = level.branch_deeper(
+                        t1_hashtable[hash_value].item,
+                        other.item,
+                        child_relationship_class=SubscriptableIterableRelationship,
+                        child_relationship_param=i,
+                        child_relationship_param2=index2,
+                    )
+                    if other.item is notpresent:
+                        self._report_result('iterable_item_removed', change_level, local_tree=local_tree)
+                    else:
+                        # I was not able to make a test case for the following 2 lines since the cases end up
+                        # getting resolved above in the hashes_added calcs. However I am leaving these 2 lines
+                        # in case things change in future.
+                        parents_ids_added = add_to_frozen_set(parents_ids, item_id)  # pragma: no cover.
+                        self._diff(change_level, parents_ids_added, local_tree=local_tree)  # pragma: no cover.
+
+            items_intersect = t2_hashes.intersection(t1_hashes)
+
+            for hash_value in items_intersect:
+                t1_indexes = t1_hashtable[hash_value].indexes
+                t2_indexes = t2_hashtable[hash_value].indexes
+                t1_indexes_len = len(t1_indexes)
+                t2_indexes_len = len(t2_indexes)
+                if t1_indexes_len != t2_indexes_len:  # this is a repetition change!
+                    # create "change" entry, keep current level untouched to handle further changes
+                    repetition_change_level = level.branch_deeper(
+                        t1_hashtable[hash_value].item,
+                        t2_hashtable[hash_value].item,  # nb: those are equal!
+                        child_relationship_class=SubscriptableIterableRelationship,
+                        child_relationship_param=t1_hashtable[hash_value]
+                        .indexes[0])
+                    repetition_change_level.additional['repetition'] = RemapDict(
+                        old_repeat=t1_indexes_len,
+                        new_repeat=t2_indexes_len,
+                        old_indexes=t1_indexes,
+                        new_indexes=t2_indexes)
+                    self._report_result('repetition_change',
+                                        repetition_change_level, local_tree=local_tree)
-# Regex to detect wildcard segments in a raw path string.
-# Matches [*], [**], .*, .** that are NOT inside quotes.
-_WILDCARD_RE = re.compile( - r'\[\*\*?\]' # [*] or [**] - r'|\.\*\*?(?=[.\[]|$)' # .* or .** followed by . or [ or end of string -) + else: + for hash_value in hashes_added: + if self._count_diff() is StopIteration: + return + other = get_other_pair(hash_value) + item_id = id(other.item) + index = t2_hashtable[hash_value].indexes[0] if other.item is notpresent else other.indexes[0] + index2 = t2_hashtable[hash_value].indexes[0] + change_level = level.branch_deeper( + other.item, + t2_hashtable[hash_value].item, + child_relationship_class=SubscriptableIterableRelationship, + child_relationship_param=index, + child_relationship_param2=index2, + ) + if other.item is notpresent: + self._report_result('iterable_item_added', change_level, local_tree=local_tree) + else: + parents_ids_added = add_to_frozen_set(parents_ids, item_id) + self._diff(change_level, parents_ids_added, local_tree=local_tree) + + for hash_value in hashes_removed: + if self._count_diff() is StopIteration: + return # pragma: no cover. This is already covered for addition. + other = get_other_pair(hash_value, in_t1=False) + item_id = id(other.item) + index = t1_hashtable[hash_value].indexes[0] + index2 = t1_hashtable[hash_value].indexes[0] if other.item is notpresent else other.indexes[0] + change_level = level.branch_deeper( + t1_hashtable[hash_value].item, + other.item, + child_relationship_class=SubscriptableIterableRelationship, + child_relationship_param=index, + child_relationship_param2=index2, + ) + if other.item is notpresent: + self._report_result('iterable_item_removed', change_level, local_tree=local_tree) + else: + # Just like the case when report_repetition = True, these lines never run currently. + # However they will stay here in case things change in future. + parents_ids_added = add_to_frozen_set(parents_ids, item_id) # pragma: no cover. + self._diff(change_level, parents_ids_added, local_tree=local_tree) # pragma: no cover. 
+ + def _diff_booleans(self, level, local_tree=None): + if level.t1 != level.t2: + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_numbers(self, level, local_tree=None, report_type_change=True): + """Diff Numbers""" + if report_type_change: + t1_type = "number" if self.ignore_numeric_type_changes else level.t1.__class__.__name__ + t2_type = "number" if self.ignore_numeric_type_changes else level.t2.__class__.__name__ + else: + t1_type = t2_type = '' + + if self.use_log_scale: + if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold): + self._report_result('values_changed', level, local_tree=local_tree) + elif self.math_epsilon is not None: + if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon): + self._report_result('values_changed', level, local_tree=local_tree) + elif self.significant_digits is None: + if level.t1 != level.t2: + self._report_result('values_changed', level, local_tree=local_tree) + else: + # Bernhard10: I use string formatting for comparison, to be consistent with usecases where + # data is read from files that were previously written from python and + # to be consistent with on-screen representation of numbers. 
+ # Other options would be abs(t1-t2)<10**-self.significant_digits + # or math.is_close (python3.5+) + # Note that abs(3.25-3.251) = 0.0009999999999998899 < 0.001 + # Note also that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114 + # For Decimals, format seems to round 2.5 to 2 and 3.5 to 4 (to closest even number) + t1_s = self.number_to_string(level.t1, + significant_digits=self.significant_digits, + number_format_notation=self.number_format_notation) # type: ignore + t2_s = self.number_to_string(level.t2, + significant_digits=self.significant_digits, + number_format_notation=self.number_format_notation) # type: ignore + + t1_s = KEY_TO_VAL_STR.format(t1_type, t1_s) + t2_s = KEY_TO_VAL_STR.format(t2_type, t2_s) + if t1_s != t2_s: + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_ipranges(self, level, local_tree=None): + """Diff IP ranges""" + if str(level.t1) != str(level.t2): + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_datetime(self, level, local_tree=None): + """Diff DateTimes""" + level.t1 = datetime_normalize(self.truncate_datetime, level.t1, default_timezone=self.default_timezone) + level.t2 = datetime_normalize(self.truncate_datetime, level.t2, default_timezone=self.default_timezone) + + if level.t1 != level.t2: + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_time(self, level, local_tree=None): + """Diff DateTimes""" + if self.truncate_datetime: + level.t1 = datetime_normalize(self.truncate_datetime, level.t1, default_timezone=self.default_timezone) + level.t2 = datetime_normalize(self.truncate_datetime, level.t2, default_timezone=self.default_timezone) + + if level.t1 != level.t2: + self._report_result('values_changed', level, local_tree=local_tree) + + def _diff_uuids(self, level, local_tree=None): + """Diff UUIDs""" + if level.t1.int != level.t2.int: + self._report_result('values_changed', level, local_tree=local_tree) + + 
def _diff_numpy_array(self, level, parents_ids=frozenset(), local_tree=None): + """Diff numpy arrays""" + if level.path() not in self._numpy_paths: + self._numpy_paths[level.path()] = get_type(level.t2).__name__ + if np is None: + # This line should never be run. If it is ever called means the type check detected a numpy array + # which means numpy module needs to be available. So np can't be None. + raise ImportError(CANT_FIND_NUMPY_MSG) # pragma: no cover + + if (self.ignore_order_func and not self.ignore_order_func(level)) or not self.ignore_order: + # fast checks + if self.significant_digits is None: + if np.array_equal(level.t1, level.t2, equal_nan=self.ignore_nan_inequality): + return # all good + else: + try: + np.testing.assert_almost_equal(level.t1, level.t2, decimal=self.significant_digits) + except TypeError: + np.array_equal(level.t1, level.t2, equal_nan=self.ignore_nan_inequality) + except AssertionError: + pass # do detailed checking below + else: + return # all good + + # compare array meta-data + _original_type = level.t1.dtype + if level.t1.shape != level.t2.shape: + # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier. + # They will be converted back to Numpy at their final dimension. + level.t1 = level.t1.tolist() + level.t2 = level.t2.tolist() + self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree) + else: + # metadata same -- the difference is in the content + shape = level.t1.shape + dimensions = len(shape) + if dimensions == 1: + self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree) + elif (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order: + # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier. + # They will be converted back to Numpy at their final dimension. 
+ level.t1 = level.t1.tolist() + level.t2 = level.t2.tolist() + self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree) + else: + for (t1_path, t1_row), (t2_path, t2_row) in zip( + get_numpy_ndarray_rows(level.t1, shape), + get_numpy_ndarray_rows(level.t2, shape)): + + new_level = level.branch_deeper( + t1_row, + t2_row, + child_relationship_class=NumpyArrayRelationship, + child_relationship_param=t1_path, + child_relationship_param2=t2_path, + ) + + self._diff_iterable_in_order(new_level, parents_ids, _original_type=_original_type, local_tree=local_tree) + + def _diff_types(self, level, local_tree=None): + """Diff types""" + level.report_type = 'type_changes' + self._report_result('type_changes', level, local_tree=local_tree) + + def _count_diff(self): + if (self.max_diffs is not None and self._stats[DIFF_COUNT] > self.max_diffs): + if not self._stats[MAX_DIFF_LIMIT_REACHED]: + self._stats[MAX_DIFF_LIMIT_REACHED] = True + logger.warning(MAX_DIFFS_REACHED_MSG.format(self.max_diffs)) + return StopIteration + self._stats[DIFF_COUNT] += 1 + if self.cache_size and self.cache_tuning_sample_size: + self._auto_tune_cache() + + def _auto_tune_cache(self): + take_sample = (self._stats[DIFF_COUNT] % self.cache_tuning_sample_size == 0) + if self.cache_tuning_sample_size: + if self._stats[DISTANCE_CACHE_ENABLED]: + if take_sample: + self._auto_off_cache() + # Turn on the cache once in a while + elif self._stats[DIFF_COUNT] % self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] == 0: + self.progress_logger('Re-enabling the distance and level caches.') + # decreasing the sampling frequency + self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] *= 10 + self._stats[DISTANCE_CACHE_ENABLED] = True + if take_sample: + for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT): + self._stats[key] = self._stats[key[9:]] + + def _auto_off_cache(self): + """ + Auto adjust the cache based on the usage + """ + if 
self._stats[DISTANCE_CACHE_ENABLED]: + angle = (self._stats[DISTANCE_CACHE_HIT_COUNT] - self._stats['PREVIOUS {}'.format(DISTANCE_CACHE_HIT_COUNT)]) / (self._stats[DIFF_COUNT] - self._stats[PREVIOUS_DIFF_COUNT]) + if angle < self.CACHE_AUTO_ADJUST_THRESHOLD: + self._stats[DISTANCE_CACHE_ENABLED] = False + self.progress_logger('Due to minimal cache hits, {} is disabled.'.format('distance cache')) + def _use_custom_operator(self, level): + """ + For each level we check all custom operators. + If any one of them was a match for the level, we run the diff of the operator. + If the operator returned True, the operator must have decided these objects should not + be compared anymore. It might have already reported their results. + In that case the report will appear in the final results of this diff. + Otherwise basically the 2 objects in the level are being omitted from the results. + """ -def path_has_wildcard(path): - """Check if a path string contains wildcard segments (* or **).""" - return bool(_WILDCARD_RE.search(path)) + for operator in self.custom_operators: + if operator.match(level): + prevent_default = operator.give_up_diffing(level=level, diff_instance=self) + if prevent_default: + return True + return False -class GlobPathMatcher: - """Pre-compiled matcher for a single glob pattern path. + def _diff(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None): + """ + The main diff method - Parses a pattern like ``root['users'][*]['password']`` into segments - and matches concrete path strings against it. + **parameters** - ``*`` matches exactly one path segment (any key, index, or attribute). - ``**`` matches zero or more path segments. - """ + level: the tree level or tree node + parents_ids: the ids of all the parent objects in the tree from the current node. 
+ _original_type: If the objects had an original type that was different than what currently exists in the level.t1 and t2 + """ + if self._count_diff() is StopIteration: + return - def __init__(self, pattern_path): - self.original_pattern = pattern_path - elements = _path_to_elements(pattern_path, root_element=('root', GETATTR)) - # Skip the root element for matching - self._pattern = elements[1:] - - def match(self, path_string): - """Return True if *path_string* matches this pattern exactly.""" - elements = _path_to_elements(path_string, root_element=('root', GETATTR)) - target = elements[1:] - return self._match_segments(self._pattern, target, 0, 0) - - def match_or_is_ancestor(self, path_string): - """Return True if *path_string* matches OR is an ancestor of a potential match. - - This is needed for ``include_paths``: we must not prune a path that - could lead to a matching descendant. - """ - elements = _path_to_elements(path_string, root_element=('root', GETATTR)) - target = elements[1:] - return (self._match_segments(self._pattern, target, 0, 0) or - self._could_match_descendant(self._pattern, target, 0, 0)) - - def match_or_is_descendant(self, path_string): - """Return True if *path_string* matches OR is a descendant of a matching path. - - This checks whether the pattern matches any prefix of *path_string*, - meaning the path is "inside" a matched subtree. 
- """ - elements = _path_to_elements(path_string, root_element=('root', GETATTR)) - target = elements[1:] - # Check exact match first - if self._match_segments(self._pattern, target, 0, 0): - return True - # Check if any prefix of target matches (making this path a descendant) - for length in range(len(target)): - if self._match_segments(self._pattern, target[:length], 0, 0): - return True - return False + if self._use_custom_operator(level): + return - @staticmethod - def _match_segments(pattern, target, pi, ti): - """Recursive segment matcher with backtracking for ``**``.""" - while pi < len(pattern) and ti < len(target): - pat_elem = pattern[pi][0] - - if pat_elem == MULTI_WILDCARD: - # ** matches zero or more segments — try every suffix - for k in range(ti, len(target) + 1): - if GlobPathMatcher._match_segments(pattern, target, pi + 1, k): - return True - return False - elif pat_elem == SINGLE_WILDCARD: - # * matches exactly one segment regardless of value/action - pi += 1 - ti += 1 + if level.t1 is level.t2: + return + + if self._skip_this(level): + return + + report_type_change = True + if get_type(level.t1) != get_type(level.t2): + for type_group in self.ignore_type_in_groups: + if self.type_check_func(level.t1, type_group) and self.type_check_func(level.t2, type_group): + report_type_change = False + break + if self.use_enum_value and isinstance(level.t1, Enum): + level.t1 = level.t1.value + report_type_change = False + if self.use_enum_value and isinstance(level.t2, Enum): + level.t2 = level.t2.value + report_type_change = False + if report_type_change: + self._diff_types(level, local_tree=local_tree) + return + # This is an edge case where t1=None or t2=None and None is in the ignore type group. 
+ if level.t1 is None or level.t2 is None: + self._report_result('values_changed', level, local_tree=local_tree) + return + + if self.ignore_nan_inequality and isinstance(level.t1, (float, np_floating)) and str(level.t1) == str(level.t2) == 'nan': + return + + if isinstance(level.t1, booleans): + self._diff_booleans(level, local_tree=local_tree) + + elif isinstance(level.t1, strings): + # Special handling when comparing string with UUID and ignore_uuid_types is True + if self.ignore_uuid_types and isinstance(level.t2, uuids): + try: + # Convert string to UUID for comparison + t1_uuid = uuid.UUID(level.t1) + if t1_uuid.int != level.t2.int: + self._report_result('values_changed', level, local_tree=local_tree) + except (ValueError, AttributeError): + # If string is not a valid UUID, report as changed + self._report_result('values_changed', level, local_tree=local_tree) else: - tgt_elem = target[ti][0] - if pat_elem != tgt_elem: - return False - pi += 1 - ti += 1 + self._diff_str(level, local_tree=local_tree) + + elif isinstance(level.t1, datetime.datetime): + self._diff_datetime(level, local_tree=local_tree) + + elif isinstance(level.t1, ipranges): + self._diff_ipranges(level, local_tree=local_tree) + + elif isinstance(level.t1, (datetime.date, datetime.timedelta, datetime.time)): + self._diff_time(level, local_tree=local_tree) + + elif isinstance(level.t1, uuids): + # Special handling when comparing UUID with string and ignore_uuid_types is True + if self.ignore_uuid_types and isinstance(level.t2, str): + try: + # Convert string to UUID for comparison + t2_uuid = uuid.UUID(level.t2) + if level.t1.int != t2_uuid.int: + self._report_result('values_changed', level, local_tree=local_tree) + except (ValueError, AttributeError): + # If string is not a valid UUID, report as changed + self._report_result('values_changed', level, local_tree=local_tree) + else: + self._diff_uuids(level, local_tree=local_tree) - # Consume any trailing ** (they can match zero segments) - while 
pi < len(pattern) and pattern[pi][0] == MULTI_WILDCARD: - pi += 1 + elif isinstance(level.t1, numbers): + self._diff_numbers(level, local_tree=local_tree, report_type_change=report_type_change) - return pi == len(pattern) and ti == len(target) + elif isinstance(level.t1, Mapping): + self._diff_dict(level, parents_ids, local_tree=local_tree) - @staticmethod - def _could_match_descendant(pattern, target, pi, ti): - """Check if *target* is a prefix that could lead to a match deeper down.""" - if ti == len(target): - # Target exhausted — it's an ancestor if pattern has remaining segments - return pi < len(pattern) + elif isinstance(level.t1, tuple): + self._diff_tuple(level, parents_ids, local_tree=local_tree) - if pi >= len(pattern): - return False + elif isinstance(level.t1, (set, frozenset, SetOrdered)): + self._diff_set(level, local_tree=local_tree) + + elif isinstance(level.t1, np_ndarray): + self._diff_numpy_array(level, parents_ids, local_tree=local_tree) - pat_elem = pattern[pi][0] + elif isinstance(level.t1, PydanticBaseModel): + self._diff_obj(level, parents_ids, local_tree=local_tree, is_pydantic_object=True) + + elif isinstance(level.t1, Iterable): + self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree) + + elif isinstance(level.t1, Enum): + self._diff_enum(level, parents_ids, local_tree=local_tree) - if pat_elem == MULTI_WILDCARD: - return (GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti) or - GlobPathMatcher._could_match_descendant(pattern, target, pi, ti + 1)) - elif pat_elem == SINGLE_WILDCARD: - return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1) else: - tgt_elem = target[ti][0] - if pat_elem != tgt_elem: - return False - return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1) + self._diff_obj(level, parents_ids) + + def _get_view_results(self, view, verbose_level=None): + """ + Get the results based on the view + """ + result = self.tree + 
if not self.report_repetition: # and self.is_root: + result.mutual_add_removes_to_become_value_changes() + if view == TREE_VIEW: + pass + elif view == TEXT_VIEW: + effective_verbose_level = verbose_level if verbose_level is not None else self.verbose_level + result = TextResult(tree_results=self.tree, verbose_level=effective_verbose_level) + result.remove_empty_keys() + elif view == DELTA_VIEW: + result = self._to_delta_dict(report_repetition_required=False) + elif view == COLORED_VIEW: + result = ColoredView(t2=self.t2, tree_result=self.tree, compact=False) + elif view == COLORED_COMPACT_VIEW: + result = ColoredView(t2=self.t2, tree_result=self.tree, compact=True) + else: + raise ValueError(INVALID_VIEW_MSG.format(view)) + return result + @staticmethod + def _get_key_for_group_by(row, group_by, item_name): + """ + Get the key value to group a row by, using the specified group_by parameter. + + Example + >>> row = {'first': 'John', 'middle': 'Joe', 'last': 'Smith'} + >>> DeepDiff._get_key_for_group_by(row, 'first', 't1') + 'John' + >>> nested_row = {'id': 123, 'demographics': {'names': {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}}} + >>> group_by = lambda x: x['demographics']['names']['first'] + >>> DeepDiff._get_key_for_group_by(nested_row, group_by, 't1') + 'John' + + Args: + row (dict): The dictionary (row) to extract the group by key from. + group_by (str or callable): The key name or function to call to get to the key value to group by. + item_name (str): The name of the item, used for error messages. + + Returns: + str: The key value to group by. + + Raises: + KeyError: If the specified key is not found in the row. + """ + try: + if callable(group_by): + return group_by(row) + return row.pop(group_by) + except KeyError: + logger.error("Unable to group {} by {}. 
The key is missing in {}".format(item_name, group_by, row)) + raise + + def _group_iterable_to_dict(self, item, group_by, item_name): + """ + Convert a list of dictionaries into a dictionary of dictionaries + where the key is the value of the group_by key in each dictionary. + """ + group_by_level2 = None + if isinstance(group_by, (list, tuple)): + group_by_level1 = group_by[0] + if len(group_by) > 1: + group_by_level2 = group_by[1] + else: + group_by_level1 = group_by + if isinstance(item, Iterable) and not isinstance(item, Mapping): + result = {} + item_copy = deepcopy(item) + for row in item_copy: + if isinstance(row, Mapping): + key1 = self._get_key_for_group_by(row, group_by_level1, item_name) + # Track keys created by group_by to avoid type prefixing later + if hasattr(self, 'group_by_keys'): + self.group_by_keys.add(key1) + if group_by_level2: + key2 = self._get_key_for_group_by(row, group_by_level2, item_name) + # Track level 2 keys as well + if hasattr(self, 'group_by_keys'): + self.group_by_keys.add(key2) + if key1 not in result: + result[key1] = {} + if self.group_by_sort_key: + if key2 not in result[key1]: + result[key1][key2] = [] + result_key1_key2 = result[key1][key2] + if row not in result_key1_key2: + result_key1_key2.append(row) + else: + result[key1][key2] = row + else: + if self.group_by_sort_key: + if key1 not in result: + result[key1] = [] + if row not in result[key1]: + result[key1].append(row) + else: + result[key1] = row + else: + msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by_level1, row) + logger.error(msg) + raise ValueError(msg) + if self.group_by_sort_key: + if group_by_level2: + for key1, row1 in result.items(): + for key2, row in row1.items(): + row.sort(key=self.group_by_sort_key) + else: + for key, row in result.items(): + row.sort(key=self.group_by_sort_key) + return result + msg = "Unable to group {} by {}".format(item_name, group_by) + logger.error(msg) + raise ValueError(msg) 
+ + def get_stats(self): + """ + Get some stats on internals of the DeepDiff run. + """ + return self._stats -def compile_glob_paths(paths): - """Compile a list of glob pattern strings into GlobPathMatcher objects. + @property + def affected_paths(self): + """ + Get the list of paths that were affected. + Whether a value was changed or they were added or removed. + + Example + >>> from pprint import pprint + >>> t1 = {1: 1, 2: 2, 3: [3], 4: 4} + >>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6} + >>> ddiff = DeepDiff(t1, t2) + >>> pprint(ddiff, indent=4) + { 'dictionary_item_added': ['root[5]', 'root[6]'], + 'dictionary_item_removed': ['root[4]'], + 'iterable_item_added': {'root[3][1]': 4}, + 'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}} + >>> sorted(ddiff.affected_paths) + ['root[2]', 'root[3][1]', 'root[4]', 'root[5]', 'root[6]'] + >>> sorted(ddiff.affected_root_keys) + [2, 3, 4, 5, 6] - Returns a list of ``GlobPathMatcher`` or ``None`` if *paths* is empty/None. - """ - if not paths: - return None - return [GlobPathMatcher(p) for p in paths] + """ + result = SetOrdered() + for key in REPORT_KEYS: + value = self.get(key) + if value: + if isinstance(value, SetOrdered): + result |= value + else: + result |= SetOrdered(value.keys()) + return result + + @property + def affected_root_keys(self): + """ + Get the list of root keys that were affected. + Whether a value was changed or they were added or removed. 
+ + Example + >>> from pprint import pprint + >>> t1 = {1: 1, 2: 2, 3: [3], 4: 4} + >>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6} + >>> ddiff = DeepDiff(t1, t2) + >>> pprint(ddiff, indent=4) + { 'dictionary_item_added': ['root[5]', 'root[6]'], + 'dictionary_item_removed': ['root[4]'], + 'iterable_item_added': {'root[3][1]': 4}, + 'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}} + >>> sorted(ddiff.affected_paths) + ['root[2]', 'root[3][1]', 'root[4]', 'root[5]', 'root[6]'] + >>> sorted(ddiff.affected_root_keys) + [2, 3, 4, 5, 6] + """ + result = SetOrdered() + for key in REPORT_KEYS: + value = self.tree.get(key) + if value: + if isinstance(value, SetOrdered): + values_list = value + else: + values_list = value.keys() + for item in values_list: + root_key = item.get_root_key() + if root_key is not notpresent: + result.add(root_key) + return result + + def __str__(self): + if hasattr(self, '_colored_view') and self.view in {COLORED_VIEW, COLORED_COMPACT_VIEW}: + return str(self._colored_view) + return super().__str__() + + +if __name__ == "__main__": # pragma: no cover + import doctest + doctest.testmod() From e1e23e2ab15241da9e4fec18a6f8c59b01951822 Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Sat, 28 Mar 2026 10:31:52 +0000 Subject: [PATCH 3/3] Updating search_doc.rst --- docs/search_doc.rst | 427 ++++++-------------------------------------- 1 file changed, 57 insertions(+), 370 deletions(-) diff --git a/docs/search_doc.rst b/docs/search_doc.rst index 7039281f..89ef333a 100644 --- a/docs/search_doc.rst +++ b/docs/search_doc.rst @@ -1,388 +1,75 @@ :orphan: -**DeepHash** +grep is a more user friendly interface for DeepSearch. It takes exactly the same arguments as DeepSearch except that you pipe the object into it instead of passing it as a parameter. -DeepHash calculates the hash of objects based on their contents in a deterministic way. -This way 2 objects with the same content should have the same hash. 
- -The main usage of DeepHash is to calculate the hash of otherwise unhashable objects. -For example you can use DeepHash to calculate the hash of a set or a dictionary! - -At the core of it, DeepHash is a deterministic serialization of your object into a string so it -can be passed to a hash function. By default it uses SHA256. You have the option to pass any other hashing function to be used instead. - -**Import** - >>> from deepdiff import DeepHash +It works just like grep in linux shell! **Parameters** -obj : any object, The object to be hashed based on its content. - - -apply_hash: Boolean, default = True - DeepHash at its core is doing deterministic serialization of objects into strings. - Then it hashes the string. - The only time you want the apply_hash to be False is if you want to know what - the string representation of your object is BEFORE it gets hashed. - - -exclude_types: list, default = None - List of object types to exclude from hashing. - - -exclude_paths: list, default = None - List of paths to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one path. - Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. - - -include_paths: list, default = None - List of the only paths to include in the report. If only one item, you can pass it as a string. - Supports :ref:`wildcard_paths_label`: use ``[*]`` to match one segment or ``[**]`` to match any depth. - - -exclude_regex_paths: list, default = None - List of string regex paths or compiled regex paths objects to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one regex path. - - -exclude_obj_callback - function, default = None - A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included. 
- This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means. - - -encodings: List, default = None - Character encodings to iterate through when we convert bytes into strings. You may want to pass an explicit list of encodings in your objects if you start getting UnicodeDecodeError from DeepHash. Also check out ignore_encoding_errors if you can get away with ignoring these errors and don't want to bother with an explicit list of encodings but it will come at the price of slightly less accuracy of the final results. Example: encodings=["utf-8", "latin-1"] - - -hashes: dictionary, default = empty dictionary - A dictionary of {object or object id: object hash} to start with. - Any object that is encountered and it is already in the hashes dictionary or its id is in the hashes dictionary, - will re-use the hash that is provided by this dictionary instead of re-calculating - its hash. This is typically used when you have a series of objects to be hashed and there might be repeats of the same object. - - -hasher: function. default = DeepHash.sha256hex - hasher is the hashing function. The default is DeepHash.sha256hex. - But you can pass another hash function to it if you want. - For example a cryptographic hash function or Python's builtin hash function. - All it needs is a function that takes the input in string format and returns the hash. - - You can use it by passing: hasher=hash for Python's builtin hash. - - The following alternative is already provided: - - - hasher=DeepHash.sha1hex - - Note that prior to DeepDiff 5.2, Murmur3 was the default hash function. - But Murmur3 is removed from DeepDiff dependencies since then. - - -ignore_repetition: Boolean, default = True - If repetitions in an iterable should cause the hash of iterable to be different. - Note that the deepdiff diffing functionality lets this to be the default at all times. 
- But if you are using DeepHash directly, you can set this parameter. - +item : The item to search for -ignore_type_in_groups - Ignore type changes between members of groups of types. For example if you want to ignore type changes between float and decimals etc. Note that this is a more granular feature. Most of the times the shortcuts provided to you are enough. - The shortcuts are ignore_string_type_changes which by default is False and ignore_numeric_type_changes which is by default False. You can read more about those shortcuts in this page. ignore_type_in_groups gives you more control compared to the shortcuts. +verbose_level : int >= 0, default = 1. + Verbose level one shows the paths of found items. + Verbose level 2 shows the path and value of the found items. - For example lets say you have specifically str and byte datatypes to be ignored for type changes. Then you have a couple of options: +exclude_paths: list, default = None. + List of paths to exclude from the report. + Supports wildcard patterns: use ``[*]`` to match one segment or ``[**]`` to match any depth. - 1. Set ignore_string_type_changes=True which is the default. - 2. Set ignore_type_in_groups=[(str, bytes)]. Here you are saying if we detect one type to be str and the other one bytes, do not report them as type change. It is exactly as passing ignore_type_in_groups=[DeepDiff.strings] or ignore_type_in_groups=DeepDiff.strings . +exclude_types: list, default = None. + List of object types to exclude from the report. - Now what if you want also typeA and typeB to be ignored when comparing agains each other? +case_sensitive: Boolean, default = False - 1. ignore_type_in_groups=[DeepDiff.strings, (typeA, typeB)] - 2. or ignore_type_in_groups=[(str, bytes), (typeA, typeB)] +match_string: Boolean, default = False + If True, the value of the object or its children have to exactly match the item. 
+ If False, the value of the item can be a part of the value of the object or its children -ignore_string_type_changes: Boolean, default = True - string type conversions should not affect the hash output when this is set to True. - For example "Hello" and b"Hello" should produce the same hash. +use_regexp: Boolean, default = False - By setting it to True, both the string and bytes of hello return the same hash. - - -ignore_numeric_type_changes: Boolean, default = False - numeric type conversions should not affect the hash output when this is set to True. - For example 10, 10.0 and Decimal(10) should produce the same hash. - When ignore_numeric_type_changes is set to True, all numbers are converted - to strings with the precision of significant_digits parameter and number_format_notation notation. - If no significant_digits is passed by the user, a default value of 12 is used. - - -ignore_type_subclasses - Use ignore_type_subclasses=True so when ignoring type (class), the subclasses of that class are ignored too. - - -ignore_string_case - Whether to be case-sensitive or not when comparing strings. By settings ignore_string_case=False, strings will be compared case-insensitively. - - -ignore_private_variables: Boolean, default = True - Whether to exclude the private variables in the calculations or not. It only affects variables that start with double underscores (__). - - -ignore_encoding_errors: Boolean, default = False - If you want to get away with UnicodeDecodeError without passing explicit character encodings, set this option to True. If you want to make sure the encoding is done properly, keep this as False and instead pass an explicit list of character encodings to be considered via the encodings parameter. - -ignore_iterable_order: Boolean, default = True - If order of items in an iterable should not cause the hash of the iterable to be different. 
- -number_format_notation : string, default="f" - number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation. - - -significant_digits : int >= 0, default=None - By default the significant_digits compares only that many digits AFTER the decimal point. However you can set override that by setting the number_format_notation="e" which will make it mean the digits in scientific notation. - - Important: This will affect ANY number comparison when it is set. - - Note: If ignore_numeric_type_changes is set to True and you have left significant_digits to the default of None, it gets automatically set to 12. The reason is that normally when numbers from 2 different types are compared, instead of comparing the values, we only report the type change. However when ignore_numeric_type_changes=True, in order compare numbers from different types to each other, we need to convert them all into strings. The significant_digits will be used to make sure we accurately convert all the numbers into strings in order to report the changes between them. - - Internally it uses "{:.Xf}".format(Your Number) to compare numbers where X=significant_digits when the number_format_notation is left as the default of "f" meaning fixed point. - - Note that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114 - - For Decimals, Python's format rounds 2.5 to 2 and 3.5 to 4 (to the closest even number) - - When you set the number_format_notation="e", we use "{:.Xe}".format(Your Number) where X=significant_digits. - -truncate_datetime: string, default = None - Can take value one of 'second', 'minute', 'hour', 'day' and truncate with this value datetime objects before hashing it - - - -**Returns** - A dictionary of {item: item hash}. 
- If your object is nested, it will build hashes of all the objects it contains too. - - -.. note:: - DeepHash output is not like conventional hash functions. It is a dictionary of object IDs to their hashes. This happens because DeepHash calculates the hash of the object and any other objects found within the object in a recursive manner. If you only need the hash of the object you are passing, all you need to do is to do: - - >>> from deepdiff import DeepHash - >>> obj = {1: 2, 'a': 'b'} - >>> DeepHash(obj)[obj] # doctest: +SKIP +strict_checking: Boolean, default = True + If True, it will check the type of the object to match, so when searching for '1234', + it will NOT match the int 1234. Currently this only affects the numeric values searching. **Examples** -Let's say you have a dictionary object. - >>> from deepdiff import DeepHash - >>> obj = {1: 2, 'a': 'b'} - -If you try to hash it: - >>> hash(obj) - Traceback (most recent call last): - File "", line 1, in - TypeError: unhashable type: 'dict' - -But with DeepHash: - - >>> from deepdiff import DeepHash - >>> obj = {1: 2, 'a': 'b'} - >>> DeepHash(obj) # doctest: +SKIP - - So what is exactly the hash of obj in this case? - DeepHash is calculating the hash of the obj and any other object that obj contains. - The output of DeepHash is a dictionary of object IDs to their hashes. - In order to get the hash of obj itself, you need to use the object (or the id of object) to get its hash: - - >>> hashes = DeepHash(obj) - >>> hashes[obj] - 'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b' - - Which you can write as: - - >>> hashes = DeepHash(obj)[obj] - - At first it might seem weird why DeepHash(obj)[obj] but remember that DeepHash(obj) is a dictionary of hashes of all other objects that obj contains too. - - If you prefer to use another hashing algorithm, you can pass it using the hasher parameter. 
- - If you do a deep copy of the obj, it should still give you the same hash: - - >>> from copy import deepcopy - >>> obj2 = deepcopy(obj) - >>> DeepHash(obj2)[obj2] - 'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b' - - Note that by default DeepHash will include string type differences. So if your strings were bytes: - - >>> obj3 = {1: 2, b'a': b'b'} - >>> DeepHash(obj3)[obj3] - '71db3231177d49f78b52a356ca206e6179417b681604d00ed703a077049e3300' - - But if you want the same hash if string types are different, set ignore_string_type_changes to True: - - >>> DeepHash(obj3, ignore_string_type_changes=True)[obj3] - 'e60c2befb84be625037c75e1e26d0bfc85a0ffc1f3cde9500f68f6eac55e5ad6' - - ignore_numeric_type_changes is by default False too. - - >>> from decimal import Decimal - >>> obj1 = {4:10} - >>> obj2 = {4.0: Decimal(10.0)} - >>> DeepHash(obj1)[4] == DeepHash(obj2)[4.0] - False - - But by setting it to True, we can get the same hash. - - >>> DeepHash(obj1, ignore_numeric_type_changes=True)[4] == DeepHash(obj2, ignore_numeric_type_changes=True)[4.0] - True - -number_format_notation: String, default = "f" - number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation. - - -ignore_string_type_changes: Boolean, default = True - By setting it to True, both the string and bytes of hello return the same hash. - - >>> DeepHash(b'hello', ignore_string_type_changes=True)[b'hello'] - '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' - >>> DeepHash('hello', ignore_string_type_changes=True)['hello'] - '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' - - -ignore_numeric_type_changes: Boolean, default = False - For example if significant_digits=5, 1.1, Decimal(1.1) are both converted to 1.10000 - - That way they both produce the same hash. 
- - >>> t1 = {1: 1, 2: 2.22} - >>> DeepHash(t1)[1] - 'c1800a30c736483f13615542e7096f7973631fef8ca935ee1ed9f35fb06fd44e' - >>> DeepHash(t1, ignore_numeric_type_changes=True)[1] == DeepHash(t1, ignore_numeric_type_changes=True)[1.0] - True - - You can pass a list of tuples or list of lists if you have various type groups. When t1 and t2 both fall under one of these type groups, the type change will be ignored. DeepDiff already comes with 2 groups: DeepDiff.strings and DeepDiff.numbers . If you want to pass both: - - >>> from deepdiff import DeepDiff - >>> ignore_type_in_groups = [DeepDiff.strings, DeepDiff.numbers] - - -ignore_type_in_groups example with custom objects: - - >>> class Burrito: - ... bread = 'flour' - ... def __init__(self): - ... self.spicy = True - ... - >>> - >>> class Taco: - ... bread = 'flour' - ... def __init__(self): - ... self.spicy = True - ... - >>> - >>> burrito = Burrito() - >>> taco = Taco() - >>> - >>> burritos = [burrito] - >>> tacos = [taco] - >>> - >>> d1 = DeepHash(burritos, ignore_type_in_groups=[(Taco, Burrito)]) - >>> d2 = DeepHash(tacos, ignore_type_in_groups=[(Taco, Burrito)]) - >>> d1[burrito] == d2[taco] - True - - -ignore_type_subclasses - Use ignore_type_subclasses=True so when ignoring type (class), the subclasses of that class are ignored too. - - >>> from deepdiff import DeepHash - >>> - >>> class ClassB: - ... def __init__(self, x): - ... self.x = x - ... def __repr__(self): - ... return "obj b" - ... - >>> - >>> class ClassC(ClassB): - ... def __repr__(self): - ... return "obj c" - ... - >>> obj_b = ClassB(1) - >>> obj_c = ClassC(1) - >>> - >>> # By default, subclasses are considered part of the type group. - ... # ignore_type_in_groups=[(ClassB, )] matches ClassC too since it's a subclass. - ... 
hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )]) - >>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )]) - >>> hashes_b[obj_b] == hashes_c[obj_c] - True - >>> - >>> # With ignore_type_subclasses=True, only exact type matches count. - ... # ClassC no longer matches (ClassB, ) group, so hashes differ. - ... hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True) - >>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True) - >>> hashes_b[obj_b] != hashes_c[obj_c] - True - -ignore_string_case - Whether to be case-sensitive or not when comparing strings. By settings ignore_string_case=False, strings will be compared case-insensitively. - - >>> from deepdiff import DeepHash - >>> DeepHash('hello')['hello'] == DeepHash('heLLO')['heLLO'] - False - >>> DeepHash('hello', ignore_string_case=True)['hello'] == DeepHash('heLLO', ignore_string_case=True)['heLLO'] - True - -exclude_obj_callback - function, default = None - A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included. - This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means. - - >>> def exclude_obj_callback(obj, path): - ... return True if isinstance(obj, str) and obj in ('x', 'y') else False - ... 
- >>> dic1 = {"x": 1, "y": 2, "z": 3} - >>> t1 = [dic1] - >>> t1_hash = DeepHash(t1, exclude_obj_callback=exclude_obj_callback) - >>> - >>> dic2 = {"z": 3} - >>> t2 = [dic2] - >>> t2_hash = DeepHash(t2, exclude_obj_callback=exclude_obj_callback) - >>> - >>> t1_hash[t1] == t2_hash[t2] - True - -number_format_notation : string, default="f" - When numbers are converted to the string, you have the choices between "f" as fixed point and "e" as scientific notation: - - >>> t1=10002 - >>> t2=10004 - >>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="f") - >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="f") - >>> - >>> t1_hash[t1] == t2_hash[t2] - False - >>> - >>> - >>> # Now we use the scientific notation - ... t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e") - >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e") +Importing + >>> from deepdiff import grep + >>> from pprint import pprint + +Search in list for string + >>> obj = ["long somewhere", "string", 0, "somewhere great!"] + >>> item = "somewhere" + >>> ds = obj | grep(item) + >>> print(ds) + {'matched_values': ['root[0]', 'root[3]']} + +Search in nested data for string + >>> obj = ["something somewhere", {"long": "somewhere", "string": 2, 0: 0, "somewhere": "around"}] + >>> item = "somewhere" + >>> ds = obj | grep(item, verbose_level=2) + >>> pprint(ds, indent=2) + { 'matched_paths': {"root[1]['somewhere']": 'around'}, + 'matched_values': { 'root[0]': 'something somewhere', + "root[1]['long']": 'somewhere'}} + +You can also use regular expressions + >>> obj = ["something here", {"long": "somewhere", "someone": 2, 0: 0, "somewhere": "around"}] + >>> ds = obj | grep("some.*", use_regexp=True) + >>> pprint(ds, indent=2) + { 'matched_paths': ["root[1]['someone']", "root[1]['somewhere']"], + 'matched_values': ['root[0]', "root[1]['long']"]} + + +Change strict_checking to False to match numbers in strings and vice versa: + >>> 
obj = {"long": "somewhere", "num": 1123456, 0: 0, "somewhere": "around"} + >>> item = "1234" + >>> # with the default strict_checking=True, the string "1234" does not match the int 1123456 + >>> ds = obj | grep(item, verbose_level=1, use_regexp=True) + >>> pprint(ds) + {} >>> - >>> t1_hash[t1] == t2_hash[t2] - True - -Defining your own number_to_string_func - Lets say you want the hash of numbers below 100 to be the same for some reason. - - >>> from deepdiff import DeepHash - >>> from deepdiff.helper import number_to_string - >>> def custom_number_to_string(number, *args, **kwargs): - ... number = 100 if number < 100 else number - ... return number_to_string(number, *args, **kwargs) - ... - >>> t1 = [10, 12, 100000] - >>> t2 = [50, 63, 100021] - >>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string) - >>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string) - >>> t1_hash[t1] == t2_hash[t2] - True - - So both lists produced the same hash thanks to the low significant digits for 100000 vs 100021 and also the custom_number_to_string that converted all numbers below 100 to be 100! + >>> ds = obj | grep(item, verbose_level=1, use_regexp=True, strict_checking=False) + >>> pprint(ds) + {'matched_values': ["root['num']"]}