From a025cef38121f2f219908ce70f66d8e1cde99178 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 12 Apr 2023 13:47:59 -0500 Subject: [PATCH 1/4] Added rtd preview action --- .github/pull-request-links.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/pull-request-links.yaml diff --git a/.github/pull-request-links.yaml b/.github/pull-request-links.yaml new file mode 100644 index 00000000..7a552123 --- /dev/null +++ b/.github/pull-request-links.yaml @@ -0,0 +1,16 @@ +name: readthedocs/actions +on: + pull_request_target: + types: + - opened + +permissions: + pull-requests: write + +jobs: + pull-request-links: + runs-on: ubuntu-latest + steps: + - uses: readthedocs/actions/preview@v1 + with: + project-slug: "cleanvision" From da6f4ba778d5d25d34b8b69af45ba8c248c28cad Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 12 Apr 2023 14:46:51 -0500 Subject: [PATCH 2/4] Updated documentation --- docs/source/cleanvision/issue_managers/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/cleanvision/issue_managers/index.rst b/docs/source/cleanvision/issue_managers/index.rst index 88657fcc..5a374b76 100644 --- a/docs/source/cleanvision/issue_managers/index.rst +++ b/docs/source/cleanvision/issue_managers/index.rst @@ -1,5 +1,6 @@ Issue Managers ============== +Contains modules for managing data issues of a particular type in Imagelab. .. 
automodule:: cleanvision.issue_managers :autosummary: From c6902d9d2108ba8af2794785a55dbe222310a497 Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 12 Apr 2023 14:52:08 -0500 Subject: [PATCH 3/4] Updated documentation --- .../issue_managers/duplicate_issue_manager.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py b/src/cleanvision/issue_managers/duplicate_issue_manager.py index 81a1fe76..df45ab2d 100644 --- a/src/cleanvision/issue_managers/duplicate_issue_manager.py +++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py @@ -47,6 +47,18 @@ def compute_hash_wrapper(args: Dict[str, Any]) -> Dict[str, Union[str, int]]: @register_issue_manager(DUPLICATE) class DuplicateIssueManager(IssueManager): + """Checks for exact and near duplicates in images. + + Attributes + ---------- + issue_name: str + Name of the issue that issue manager handles + visualization: str + Type of visualization for issues detected by `DuplicateIssueManager` + + + """ + issue_name: str = DUPLICATE visualization: str = "image_sets" From 936dcb5474d08a27ca604df578736d0c177df37c Mon Sep 17 00:00:00 2001 From: Sanjana Garg Date: Wed, 12 Apr 2023 15:38:00 -0500 Subject: [PATCH 4/4] Updated documentation --- examples/custom_issue_manager.py | 10 ++--- .../issue_managers/duplicate_issue_manager.py | 41 ++++++++++++------- .../image_property_issue_manager.py | 8 ++-- src/cleanvision/utils/base_issue_manager.py | 4 +- tests/test_duplicate_issue_manager.py | 2 +- tests/test_image_property_issue_manager.py | 2 +- 6 files changed, 39 insertions(+), 28 deletions(-) diff --git a/examples/custom_issue_manager.py b/examples/custom_issue_manager.py index df737733..1afa0837 100644 --- a/examples/custom_issue_manager.py +++ b/examples/custom_issue_manager.py @@ -25,13 +25,13 @@ class CustomIssueManager(IssueManager): def __init__(self) -> None: super().__init__() - self.params = self.get_default_params() + self.params = 
self._get_default_params() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return {"threshold": 0.4} - def update_params(self, params: Dict[str, Any]) -> None: - self.params = self.get_default_params() + def _update_params(self, params: Dict[str, Any]) -> None: + self.params = self._get_default_params() non_none_params = {k: v for k, v in params.items() if v is not None} self.params = {**self.params, **non_none_params} @@ -65,7 +65,7 @@ def find_issues( assert imagelab_info is not None assert dataset is not None - self.update_params(params) + self._update_params(params) raw_scores = [] for idx in tqdm(dataset.index): diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py b/src/cleanvision/issue_managers/duplicate_issue_manager.py index df45ab2d..0dc326c0 100644 --- a/src/cleanvision/issue_managers/duplicate_issue_manager.py +++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py @@ -47,17 +47,7 @@ def compute_hash_wrapper(args: Dict[str, Any]) -> Dict[str, Union[str, int]]: @register_issue_manager(DUPLICATE) class DuplicateIssueManager(IssueManager): - """Checks for exact and near duplicates in images. 
- - Attributes - ---------- - issue_name: str - Name of the issue that issue manager handles - visualization: str - Type of visualization for issues detected by `DuplicateIssueManager` - - - """ + """Checks for exact and near duplicates in images.""" issue_name: str = DUPLICATE visualization: str = "image_sets" @@ -65,15 +55,15 @@ class DuplicateIssueManager(IssueManager): def __init__(self) -> None: super().__init__() self.issue_types: List[str] = [] - self.params = self.get_default_params() + self.params = self._get_default_params() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return { IssueType.EXACT_DUPLICATES.value: {"hash_type": "md5"}, IssueType.NEAR_DUPLICATES.value: {"hash_type": "phash", "hash_size": 8}, } - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: for issue_type in self.params: non_none_params = { k: v for k, v in params.get(issue_type, {}).items() if v is not None @@ -114,13 +104,34 @@ def find_issues( n_jobs: Optional[int] = None, **kwargs: Any, ) -> None: + """Finds exact and near duplicates in the images + + Parameters + ---------- + params: Dict[str, Any], optional + Dict of custom hyperparameters for checking duplicate issues. Default value is empty. + dataset: Dataset + Dataset object on which to run the duplicate checks + imagelab_info: Dict[str, Any] + imagelab.info dict containing computations for reuse + n_jobs: int + Number of processing threads used by multiprocessing. + Default None sets to the number of cores on your CPU (physical cores if you have psutil package installed, otherwise logical cores). + Set this to 1 to disable parallel processing (if it's causing issues). Windows users may see a speed-up with n_jobs=1. + For :py:class:`TorchDataset` this is set to 1. 
+ kwargs: Any + + Returns + ------- + + """ super().find_issues(**kwargs) assert params is not None assert imagelab_info is not None assert dataset is not None self.issue_types = list(params.keys()) - self.update_params(params) + self._update_params(params) to_compute = self._get_issue_types_to_compute(self.issue_types, imagelab_info) issue_type_hash_mapping: Dict[str, Any] = { diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py index 3c98d5e8..1af74205 100644 --- a/src/cleanvision/issue_managers/image_property_issue_manager.py +++ b/src/cleanvision/issue_managers/image_property_issue_manager.py @@ -54,10 +54,10 @@ class ImagePropertyIssueManager(IssueManager): def __init__(self) -> None: super().__init__() self.issue_types: List[str] = [] - self.params = self.get_default_params() + self.params = self._get_default_params() self.image_properties = self._get_image_properties() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return { IssueType.DARK.value: {"threshold": 0.37}, IssueType.LIGHT.value: {"threshold": 0.05}, @@ -70,7 +70,7 @@ def get_default_params(self) -> Dict[str, Any]: IssueType.GRAYSCALE.value: {}, } - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: for issue_type in self.params: non_none_params = { k: v for k, v in params.get(issue_type, {}).items() if v is not None @@ -132,7 +132,7 @@ def find_issues( additional_set = self._get_additional_to_compute_set(self.issue_types) self.issue_types = self.issue_types + additional_set - self.update_params(params) + self._update_params(params) agg_computations = pd.DataFrame(index=dataset.index) agg_computations = self._add_prev_computations(agg_computations, imagelab_info) diff --git a/src/cleanvision/utils/base_issue_manager.py b/src/cleanvision/utils/base_issue_manager.py index 67a01c1e..1289a7bb 100644 
--- a/src/cleanvision/utils/base_issue_manager.py +++ b/src/cleanvision/utils/base_issue_manager.py @@ -49,12 +49,12 @@ def find_issues(self, **kwargs: Any) -> None: return @abstractmethod - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: """Returns default params to be used by the issue_manager""" raise NotImplementedError @abstractmethod - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: """Sets params for an issue manager. Default params will be overridden by user provided params""" raise NotImplementedError diff --git a/tests/test_duplicate_issue_manager.py b/tests/test_duplicate_issue_manager.py index a6c5bbf7..9c184aca 100644 --- a/tests/test_duplicate_issue_manager.py +++ b/tests/test_duplicate_issue_manager.py @@ -47,7 +47,7 @@ def test_set_params(self, params, expected_params, issue_manager): 1. If no params are specified for an issue_type, default params are used 2. If params are specified, those specific params are updated, for the remaining ones default values are used """ - issue_manager.update_params(params) + issue_manager._update_params(params) assert issue_manager.params == expected_params @pytest.mark.parametrize( diff --git a/tests/test_image_property_issue_manager.py b/tests/test_image_property_issue_manager.py index 1f1a5faf..b7928281 100644 --- a/tests/test_image_property_issue_manager.py +++ b/tests/test_image_property_issue_manager.py @@ -62,7 +62,7 @@ def test_set_params(self, params, expected_params, issue_manager): issue_manager: instance of ImagePropertyIssueManager """ - issue_manager.update_params(params) + issue_manager._update_params(params) assert issue_manager.params == expected_params @pytest.mark.parametrize(